From f17ce28b6d169eea89b3df368ed962fc564d0415 Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 3 Oct 2006 11:05:48 +0000 Subject: [PATCH] *) plasmaHTCache: - method loadResourceContent defined as deprecated. Please do not use this function to avoid OutOfMemory Exceptions when loading large files - new function getResourceContentStream to get an inputstream of a cache file - new function getResourceContentLength to get the size of a cached file *) httpc.java: - Bugfix: resource content was loaded into memory even if this was not requested *) Crawler: - new option to hold loaded resource content in memory - adding option to use the worker class without the worker pool (needed by the snippet fetcher) *) plasmaSnippetCache - snippet loader does not use a crawl-worker from pool but uses a newly created instance to avoid blocking by normal crawling activity. - now operates on streams instead of byte arrays to avoid OutOfMemory Exceptions when operating on large files - snippet loader now forces the crawl-worker to keep the loaded resource in memory to avoid IO *) plasmaCondenser: adding new function getWords that can directly operate on input streams *) Parsers - keep resource in memory whenever possible (to avoid IO) - when parsing from stream the content length must be passed to the parser function now. 
this length value is needed by the parsers to decide if the parsed resource content is to large to hold it in memory and must be stored to file - AbstractParser.java: new function to pass the contentLength of a resource to the parsers git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2701 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 60 ++++-- htroot/ViewImage.java | 20 +- source/de/anomic/http/httpc.java | 14 +- source/de/anomic/http/httpdProxyHandler.java | 2 +- .../plasma/crawler/AbstractCrawlWorker.java | 29 ++- .../plasma/crawler/http/CrawlWorker.java | 5 +- .../plasma/crawler/plasmaCrawlerFactory.java | 10 +- .../plasma/crawler/plasmaCrawlerPool.java | 7 + .../anomic/plasma/parser/AbstractParser.java | 17 +- source/de/anomic/plasma/parser/Parser.java | 2 + .../anomic/plasma/parser/bzip/bzipParser.java | 4 +- .../anomic/plasma/parser/doc/docParser.java | 1 + .../anomic/plasma/parser/gzip/gzipParser.java | 4 +- .../parser/mimeType/mimeTypeParser.java | 4 +- .../anomic/plasma/parser/odt/odtParser.java | 4 +- .../anomic/plasma/parser/pdf/pdfParser.java | 5 +- .../anomic/plasma/parser/rpm/rpmParser.java | 4 +- .../anomic/plasma/parser/rss/rssParser.java | 4 +- .../anomic/plasma/parser/rtf/rtfParser.java | 1 + .../anomic/plasma/parser/tar/tarParser.java | 6 +- .../anomic/plasma/parser/vcf/vcfParser.java | 4 +- .../anomic/plasma/parser/zip/zipParser.java | 6 +- source/de/anomic/plasma/plasmaCondenser.java | 9 +- .../de/anomic/plasma/plasmaCrawlLoader.java | 27 ++- .../plasma/plasmaCrawlLoaderMessage.java | 5 +- source/de/anomic/plasma/plasmaHTCache.java | 40 +++- source/de/anomic/plasma/plasmaParser.java | 168 ++++++++-------- .../de/anomic/plasma/plasmaSearchImages.java | 10 +- .../de/anomic/plasma/plasmaSnippetCache.java | 189 ++++++++++++------ .../de/anomic/plasma/plasmaSwitchboard.java | 22 +- source/de/anomic/server/serverFileUtils.java | 2 +- 31 files changed, 465 insertions(+), 220 deletions(-) diff --git a/htroot/ViewFile.java 
b/htroot/ViewFile.java index b1f12ef9e..525c9d7e4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -45,6 +45,7 @@ //if the shell's current path is HTROOT import java.io.IOException; +import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; @@ -60,6 +61,7 @@ import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.plasmaCrawlLURL.Entry; +import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -121,18 +123,20 @@ public class ViewFile { } // loading the resource content as byte array - byte[] resource = null; + InputStream resource = null; + long resourceLength = -1; IResourceInfo resInfo = null; String resMime = null; try { // trying to load the resource body - resource = sb.cacheManager.loadResourceContent(url); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); // if the resource body was not cached we try to load it from web if (resource == null) { plasmaHTCache.Entry entry = null; try { - entry = sb.snippetCache.loadResourceFromWeb(url, 5000); + entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false); } catch (plasmaCrawlerException e) { prop.put("error",4); prop.put("error_errorText",e.getMessage()); @@ -142,11 +146,13 @@ public class ViewFile { if (entry != null) { resInfo = entry.getDocumentInfo(); - resource = sb.cacheManager.loadResourceContent(url); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); } if (resource == null) { prop.put("error",4); + prop.put("error_errorText","No resource available"); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } @@ -172,21 +178,46 @@ public class ViewFile { httpHeader responseHeader = 
httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); if (responseHeader == null) { prop.put("error",4); + prop.put("error_errorText","Unable to load resource metadata."); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } + try { + resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader); + } catch (Exception e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } resMime = responseHeader.mime(); } } else { resMime = resInfo.getMimeType(); } } catch (IOException e) { + if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */} prop.put("error",4); + prop.put("error_errorText",e.getMessage()); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; - } - if (viewMode.equals("plain")) { - String content = new String(resource); + } + + if (viewMode.equals("plain")) { + + // TODO: how to handle very large files here ? + String content; + try { + content = new String(serverFileUtils.read(resource),"UTF-8"); + } catch (Exception e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } finally { + if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} + } + content = content.replaceAll("<","<") .replaceAll(">",">") .replaceAll("\"",""") @@ -195,12 +226,15 @@ public class ViewFile { prop.put("error",0); prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); - prop.put("viewMode_plainText",content); - } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { + prop.put("viewMode_plainText",content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode",VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url",url.toString()); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { // parsing the resource content plasmaParserDocument document = null; try { 
- document = sb.snippetCache.parseDocument(url, resource,resInfo); + document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo); if (document == null) { prop.put("error",5); prop.put("error_errorText","Unknown error"); @@ -212,7 +246,10 @@ public class ViewFile { prop.put("error_errorText",e.getMessage()); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; + } finally { + if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} } + resMime = document.getMimeType(); if (viewMode.equals("parsed")) { @@ -223,9 +260,6 @@ public class ViewFile { prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); prop.put("viewMode_parsedText",content); - } else if (viewMode.equals("iframe")) { - prop.put("viewMode",VIEW_MODE_AS_IFRAME); - prop.put("viewMode_url",url.toString()); } else { prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); String[] sentences = document.getSentences(); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 1d1329873..30d765ee2 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -43,11 +43,14 @@ import java.awt.Container; import java.awt.Image; import java.awt.MediaTracker; import java.awt.Toolkit; +import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import de.anomic.http.httpHeader; import de.anomic.net.URL; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -70,9 +73,20 @@ public class ViewImage { int maxheight = post.getInt("maxheight", 0); int timeout = post.getInt("timeout", 5000); - // load image - byte[] imgb = sb.snippetCache.getResource(url, true, timeout); - if (imgb == null) return null; + // getting the image as stream + InputStream imgStream = (InputStream) sb.snippetCache.getResource(url, true, timeout)[0]; + if (imgStream == null) return null; + + // read image data + byte[] imgb = null; + try { + imgb = 
serverFileUtils.read(imgStream); + } catch (IOException e) { + return null; + } finally { + try { imgStream.close(); } catch (Exception e) {/* ignore this */} + } + // create image MediaTracker mediaTracker = new MediaTracker(new Container()); diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index ee910c023..0a5656058 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -1828,7 +1828,7 @@ do upload // return sbb.getBytes(); return serverFileUtils.read(this.getContentInputStream()); } - + /** * This method outputs the found content into an byte-array and * additionally outputs it to procOS. @@ -1837,9 +1837,13 @@ do upload * @return * @throws IOException */ - public byte[] writeContent(Object procOS) throws IOException { - int contentLength = (int) this.responseHeader.contentLength(); - serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); + public byte[] writeContent(Object procOS, boolean returnByteArray) throws IOException { + serverByteBuffer sbb = null; + + if (returnByteArray) { + int contentLength = (int) this.responseHeader.contentLength(); + sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); + } if (procOS instanceof OutputStream) { //writeContentX(httpc.this.clientInput, this.gzip, this.responseHeader.contentLength(), procOS, sbb); @@ -1852,7 +1856,7 @@ do upload throw new IllegalArgumentException("Invalid procOS object type '" + procOS.getClass().getName() + "'"); } - return sbb.getBytes(); + return (sbb==null)?null:sbb.getBytes(); } /** diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index ea7edfbcb..441bdef2c 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -662,7 +662,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if ((contentLength > 0) && (contentLength < 1048576)) // if the length 
is known and < 1 MB { // ok, we don't write actually into a file, only to RAM, and schedule writing the file. - byte[] cacheArray = res.writeContent(hfos); + byte[] cacheArray = res.writeContent(hfos,true); this.theLogger.logFine("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length))); if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).finalize(); diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 2df4f4d4b..6960ea857 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -80,6 +80,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW */ protected boolean done = false; + /* ============================================================ * Crawl job specific variables * ============================================================ */ @@ -92,6 +93,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW protected long startdate; protected plasmaCrawlProfile.entry profile; protected boolean acceptAllContent; + protected boolean keepInMemory; protected String errorMessage; @@ -159,22 +161,27 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW try { // The thread keeps running. - while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) { - if (this.done) { - synchronized (this) { - // return thread back into pool - this.myPool.returnObject(this.protocol,this); - - // We are waiting for a new task now. 
- if (!this.stopped && !this.destroyed && !this.isInterrupted()) { - this.wait(); + while (!this.stopped && !this.isInterrupted()) { + if (this.done) { + if (this.myPool != null && !this.myPool.isClosed) { + synchronized (this) { + // return thread back into pool + this.myPool.returnObject(this.protocol,this); + + // We are waiting for a new task now. + if (!this.stopped && !this.destroyed && !this.isInterrupted()) { + this.wait(); + } } + } else { + this.stopped = true; } } else { try { // executing the new task execute(); } finally { + // free memory reset(); } } @@ -231,6 +238,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.depth = theNewMsg.depth; this.profile = theNewMsg.profile; this.acceptAllContent = theNewMsg.acceptAllContent; + this.keepInMemory = theNewMsg.keepInMemory; this.startdate = System.currentTimeMillis(); @@ -260,6 +268,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW public void reset() { this.theMsg = null; + this.url = null; this.name = null; this.refererURLString = null; @@ -268,6 +277,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.startdate = 0; this.profile = null; this.acceptAllContent = false; + this.keepInMemory = false; + this.errorMessage = null; } diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 54c1a8a60..ebb064048 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -262,8 +262,9 @@ public final class CrawlWorker extends AbstractCrawlWorker { } // we write the new cache entry to file system directly - res.writeContent(fos); - htCache.setCacheArray(null); + byte[] cacheArray = null; + cacheArray = res.writeContent(fos,this.keepInMemory); + htCache.setCacheArray(cacheArray); this.cacheManager.writeFileAnnouncement(cacheFile); } finally { if 
(fos!=null)try{fos.close();}catch(Exception e){/* ignore this */} diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java index 7f4f229ee..6f974957d 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java @@ -84,10 +84,14 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { this.thePool = pool; } + public Object makeObject(Object key) throws Exception { + return makeObject(key, true); + } + /** * @see org.apache.commons.pool.PoolableObjectFactory#makeObject() */ - public Object makeObject(Object key) throws Exception { + public Object makeObject(Object key, boolean usePool) throws Exception { if (!(key instanceof String)) throw new IllegalArgumentException("The object key must be of type string."); @@ -109,11 +113,11 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { // instantiating class plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] { this.theThreadGroup, - this.thePool, + (usePool)?this.thePool:null, this.sb, this.cacheManager, this.theLog - }); + }); // return the newly created object return theCrawlWorker; diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java index f69845901..7b52106ee 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java @@ -52,15 +52,22 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool; import de.anomic.server.logging.serverLog; public final class plasmaCrawlerPool extends GenericKeyedObjectPool { + + private plasmaCrawlerFactory theFactory; private final ThreadGroup theThreadGroup; public boolean isClosed = false; public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, GenericKeyedObjectPool.Config config, ThreadGroup 
threadGroup) { super(objFactory, config); + this.theFactory = objFactory; this.theThreadGroup = threadGroup; objFactory.setPool(this); } + public plasmaCrawlerFactory getFactory() { + return this.theFactory; + } + public Object borrowObject(Object key) throws Exception { return super.borrowObject(key); } diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index 2c7f1d701..baa413a06 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -90,7 +90,7 @@ public abstract class AbstractParser implements Parser{ * The source file file size in bytes if the source document was passed * in as file */ - protected long fileSize = -1; + protected long contentLength = -1; /** * The Constructor of this class. @@ -99,6 +99,15 @@ public abstract class AbstractParser implements Parser{ super(); this.libxDependencies = libxDependencies; } + + /** + * Set the content length of the source file. + * This value is needed by some parsers to decide + * if the parsed text could be hold in memory + */ + public void setContentLength(long length) { + this.contentLength = length; + } /** * Check if the parser was interrupted. 
@@ -185,7 +194,7 @@ public abstract class AbstractParser implements Parser{ BufferedInputStream contentInputStream = null; try { // getting the file size of the document - this.fileSize = sourceFile.length(); + this.contentLength = sourceFile.length(); // create a stream from the file contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); @@ -242,4 +251,8 @@ public abstract class AbstractParser implements Parser{ public String getName() { return this.parserName; } + + public void reset() { + this.contentLength = -1; + } } diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 83d0daa5c..a1adeae06 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -117,6 +117,8 @@ public interface Parser { */ public void reset(); + public void setContentLength(long length); + /** * @return Returns a list of library names that are needed by this parser */ diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index 60621e7f8..53b2630dd 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -138,7 +138,7 @@ public class bzipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index b5a076399..92c116b4c 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -118,6 +118,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index 0c2af76b3..a289eb361 100644 --- 
a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -122,7 +122,7 @@ public class gzipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 38665c6c5..70b01f471 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -187,8 +187,7 @@ implements Parser { } } - public plasmaParserDocument parse(URL location, String mimeType,String charset, - InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException, InterruptedException { File dstFile = null; try { dstFile = File.createTempFile("mimeTypeParser",".tmp"); @@ -208,6 +207,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 9d8e9e011..6fc977644 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -197,8 +197,8 @@ public class odtParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 5f2fca420..174d8fbd9 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -132,7 +132,7 @@ public class pdfParser extends AbstractParser 
implements Parser { } // creating a writer for output - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { writerFile = File.createTempFile("pdfParser",".tmp"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { @@ -199,7 +199,8 @@ public class pdfParser extends AbstractParser implements Parser { } public void reset() { - this.fileSize = -1; + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 6c52cb97c..90ee23222 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -169,8 +169,8 @@ public class rpmParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 41cf8573b..dbf3d11ee 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -211,8 +211,8 @@ public class rssParser extends AbstractParser implements Parser { } public void reset() { - // TODO Auto-generated method stub - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index c054f079e..de5e3ff72 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -124,6 +124,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java 
b/source/de/anomic/plasma/parser/tar/tarParser.java index 4d3ff6860..4f066232a 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -104,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser { File outputFile = null; plasmaParserDocument subDoc = null; try { - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = File.createTempFile("zipParser",".tmp"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -251,7 +251,7 @@ public class tarParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index e31010537..f553d5032 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -262,8 +262,8 @@ public class vcfParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 8a523dbcf..e672df7dd 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -102,7 +102,7 @@ public class zipParser extends AbstractParser implements Parser { File outputFile = null; plasmaParserDocument subDoc = null; try { - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = 
File.createTempFile("zipParser",".tmp"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -235,7 +235,7 @@ public class zipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 6ca5bfc63..d72eb43f8 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -671,11 +671,16 @@ public final class plasmaCondenser { } */ + public static Iterator getWords(InputStream input) { + if (input == null) return null; + plasmaCondenser condenser = new plasmaCondenser(input); + return condenser.words(); + } + public static Iterator getWords(byte[] text) { if (text == null) return null; ByteArrayInputStream buffer = new ByteArrayInputStream(text); - plasmaCondenser condenser = new plasmaCondenser(buffer); - return condenser.words(); + return getWords(buffer); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index e349da8bf..eac42b16e 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -163,15 +163,23 @@ public final class plasmaCrawlLoader extends Thread { return this.theThreadGroup; } - private void execute(plasmaCrawlLoaderMessage theMsg) throws Exception { + private void execute(plasmaCrawlLoaderMessage theMsg, boolean useThreadPool) throws Exception { // getting the protocol of the next URL String protocol = theMsg.url.getProtocol(); // TODO: remove this if (protocol.equals("https")) protocol = "http"; - // getting a new crawler from the crawler pool - plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); + // get a new worker thread + plasmaCrawlWorker theWorker = 
null; + if (useThreadPool) { + // getting a new crawler from the crawler pool + theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); + } else { + // create a new one + theWorker = (plasmaCrawlWorker) this.crawlwerPool.getFactory().makeObject(protocol,false); + } + if (theWorker == null) { this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url); } else { @@ -187,7 +195,7 @@ public final class plasmaCrawlLoader extends Thread { plasmaCrawlLoaderMessage theMsg = this.theQueue.waitForMessage(); // start new crawl job - this.execute(theMsg); + this.execute(theMsg, true); } catch (InterruptedException e) { Thread.interrupted(); @@ -218,7 +226,8 @@ public final class plasmaCrawlLoader extends Thread { String initiator, int depth, plasmaCrawlProfile.entry profile, - int timeout + int timeout, + boolean keepInMemory ) throws plasmaCrawlerException { plasmaHTCache.Entry result = null; @@ -235,13 +244,14 @@ public final class plasmaCrawlLoader extends Thread { profile, crawlingPriority, true, - timeout + timeout, + keepInMemory ); try { // start new crawl job - this.execute(theMsg); + this.execute(theMsg, false); // wait for the crawl job result result = theMsg.waitForResult(); @@ -283,7 +293,8 @@ public final class plasmaCrawlLoader extends Thread { profile, // crawling profile crawlingPriority, // crawling priority false, // only download documents whose mimetypes are enabled for the crawler - -1 // use default crawler timeout + -1, // use default crawler timeout + false // resource should not be kept in memory ); // adding the message to the queue diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java index b3d678c67..60929d606 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java +++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java @@ -56,6 +56,7 @@ public final class plasmaCrawlLoaderMessage { public final plasmaCrawlProfile.entry 
profile; public final boolean acceptAllContent; public final int timeout; + public final boolean keepInMemory; private serverSemaphore resultSync = null; private plasmaHTCache.Entry result; @@ -71,7 +72,8 @@ public final class plasmaCrawlLoaderMessage { plasmaCrawlProfile.entry profile, int crawlingPriority, boolean acceptAllContent, - int timeout + int timeout, + boolean keepInMemory ) { this.url = url; this.name = name; @@ -82,6 +84,7 @@ public final class plasmaCrawlLoaderMessage { this.crawlingPriority = crawlingPriority; this.acceptAllContent = acceptAllContent; this.timeout = timeout; + this.keepInMemory = keepInMemory; this.resultSync = new serverSemaphore(0); this.result = null; diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 293acdc28..17d34d372 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -53,9 +53,12 @@ package de.anomic.plasma; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.Date; @@ -701,16 +704,51 @@ public final class plasmaHTCache { return null; } + /** + * @param url + * @return + * + * @deprecated dont't use this function to avoid OutOfMemory-Exceptions. + * Use {@link #getResourceContentStream(URL)} instead + */ public byte[] loadResourceContent(URL url) { // load the url as resource from the cache File f = getCachePath(url); - if (f.exists()) try { + if (f.exists() && f.canRead()) try { return serverFileUtils.read(f); } catch (IOException e) { return null; } return null; } + + /** + * Returns the content of a cached resource as {@link InputStream} + * @param url the requested resource + * @return the resource content as {@link InputStream}. 
In no data + * is available or the cached file is not readable, null + * is returned. + */ + public InputStream getResourceContentStream(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists() && f.canRead()) try { + return new BufferedInputStream(new FileInputStream(f)); + } catch (IOException e) { + this.log.logSevere("Unable to create a BufferedInputStream from file " + f,e); + return null; + } + return null; + } + + public long getResourceContentLength(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists() && f.canRead()) { + return f.length(); + } + return 0; + } public static boolean isPOST(String urlString) { return (urlString.indexOf("?") >= 0 || diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index e339bc0fe..b420b7ffc 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -45,11 +45,13 @@ package de.anomic.plasma; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; @@ -465,16 +467,25 @@ public final class plasmaParser { } catch (Exception e) {/* ignore this */} } - public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) + public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] sourceArray) throws InterruptedException, ParserException { - File tempFile = null; + ByteArrayInputStream byteIn = null; try { - // creating a temp file to store the byte array - tempFile = File.createTempFile("parseSource", ".tmp"); - serverFileUtils.write(source, tempFile); + if (this.theLogger.isFine()) + 
this.theLogger.logFine("Parsing '" + location + "' from byte-array"); + + // testing if the resource is not empty + if (sourceArray == null || sourceArray.length == 0) { + String errorMsg = "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + + // creating an InputStream + byteIn = new ByteArrayInputStream(sourceArray); // parsing the temp file - return parseSource(location, mimeType, charset, tempFile); + return parseSource(location, mimeType, charset, sourceArray.length, byteIn); } catch (Exception e) { // Interrupted- and Parser-Exceptions should pass through @@ -482,20 +493,65 @@ public final class plasmaParser { if (e instanceof ParserException) throw (ParserException) e; // log unexpected error - this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e); + this.theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); throw new ParserException("Unexpected exception while parsing " + location,location, e); } finally { - if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */} + if (byteIn != null) try { byteIn.close(); } catch (Exception ex){/* ignore this */} } } - public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) - throws InterruptedException, ParserException { + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) throws InterruptedException, ParserException { + + BufferedInputStream sourceStream = null; + try { + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from file"); + + // testing if the resource is not empty + if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { + String errorMsg = sourceFile.exists() 
? "Empty resource file." : "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + + // create a new InputStream + sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); + + // parsing the data + return this.parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream); + + } catch (Exception e) { + // Interrupted- and Parser-Exceptions should pass through + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + // log unexpected error + this.theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); + throw new ParserException("Unexpected exception while parsing " + location,location, e); + } finally { + if (sourceStream != null) try { sourceStream.close(); } catch (Exception ex){/* ignore this */} + } + } + + /** + * To parse a resource from an {@link InputStream} + * @param location the URL of the resource + * @param theMimeType the resource mimetype (null if unknown) + * @param theDocumentCharset the charset of the resource (null if unknown) + * @param contentLength the content length of the resource (-1 if unknown) + * @param sourceStream an {@link InputStream} containing the resource body + * @return the parsed {@link plasmaParserDocument document} + * @throws InterruptedException + * @throws ParserException + */ + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, long contentLength, InputStream sourceStream) throws InterruptedException, ParserException { Parser theParser = null; String mimeType = null; try { + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from stream"); + // getting the mimetype of the document mimeType = getRealMimeType(theMimeType); @@ -513,66 
+569,9 @@ public final class plasmaParser { throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); } - // testing if the resource is not empty - if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; - this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); - } - - if (this.theLogger.isFine()) this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + - "' and file extension '" + fileExt + "'."); - - /* - * There are some problematic mimeType - fileExtension combination where we have to enforce - * a mimeType detection to get the proper parser for the content - * - * - application/zip + .odt - * - text/plain + .odt - * - text/plain + .vcf - * - text/xml + .rss - * - text/xml + .atom - * - * In all these cases we can trust the fileExtension and have to determine the proper mimeType. 
- * - */ - -// // Handling of not trustable mimeTypes -// // - text/plain -// // - text/xml -// // - application/octet-stream -// // - application/zip -// if ( -// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) || -// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt")) -// ) { -// if (this.theLogger.isFine()) -// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType + -// "' that seems not to be correct for file extension '" + fileExt + "'."); -// -// if (enabledParserList.containsKey("application/octet-stream")) { -// theParser = this.getParser("application/octet-stream"); -// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile); -// if (newMime == null) -// if (newMime instanceof String) { -// String newMimeType = (String)newMime; -// if ((newMimeType.equals("application/octet-stream")) { -// return null; -// } -// mimeType = newMimeType; -// } -// } else { -// return null; -// } -// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){ -// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) { -// mimeType = "application/vnd.oasis.opendocument.text"; -// } else { -// return null; -// } -// } + "' and file extension '" + fileExt + "'."); // getting the correct parser for the given mimeType theParser = this.getParser(mimeType); @@ -580,9 +579,12 @@ public final class plasmaParser { // if a parser was found we use it ... 
plasmaParserDocument doc = null; if (theParser != null) { - doc = theParser.parse(location, mimeType,documentCharset,sourceFile); + // set the content length of the resource + theParser.setContentLength(contentLength); + // parse the resource + doc = theParser.parse(location, mimeType,documentCharset,sourceStream); } else if (realtimeParsableMimeTypesContains(mimeType)) { - doc = parseHtml(location, mimeType, documentCharset, sourceFile); + doc = parseHtml(location, mimeType, documentCharset, sourceStream); } else { String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); @@ -611,14 +613,13 @@ public final class plasmaParser { if (theParser != null) { try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */} } - } + } } - private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException { + private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, InputStream sourceStream) throws IOException, ParserException { // ...otherwise we make a scraper and transformer - FileInputStream fileIn = new FileInputStream(sourceFile); - htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false); + htmlFilterInputStream htmlFilter = new htmlFilterInputStream(sourceStream,documentCharset,location,null,false); String charset = htmlFilter.detectCharset(); if (charset == null) { charset = documentCharset; @@ -763,7 +764,7 @@ public final class plasmaParser { //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out try { - File contentFile = null; + Object content = null; URL contentURL = 
null; String contentMimeType = "application/octet-stream"; String charSet = "UTF-8"; @@ -774,17 +775,13 @@ public final class plasmaParser { String mode = args[0]; if (mode.equalsIgnoreCase("-f")) { - contentFile = new File(args[1]); - contentURL = new URL(contentFile); + content = new File(args[1]); + contentURL = new URL((File)content); } else if (mode.equalsIgnoreCase("-u")) { contentURL = new URL(args[1]); // downloading the document content - byte[] contentBytes = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); - - contentFile = File.createTempFile("content",".tmp"); - contentFile.deleteOnExit(); - serverFileUtils.write(contentBytes, contentFile); + content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); } if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) { @@ -805,7 +802,12 @@ public final class plasmaParser { plasmaParser.enableAllParsers(PARSER_MODE_PROXY); // parsing the content - plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile); + plasmaParserDocument document = null; + if (content instanceof byte[]) { + document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content); + } else if (content instanceof File) { + document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content); + } // printing out all parsed sentences if (document != null) { diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 129302433..d6ea1bd9d 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -41,6 +41,7 @@ package de.anomic.plasma; +import java.io.InputStream; import java.net.MalformedURLException; import java.util.Iterator; import java.util.Map; @@ -59,13 +60,18 @@ public final class plasmaSearchImages { long start = System.currentTimeMillis(); this.images = new TreeSet(); if (maxTime > 10) { 
- byte[] res = sc.getResource(url, true, (int) maxTime); + Object[] resource = sc.getResource(url, true, (int) maxTime); + InputStream res = (InputStream) resource[0]; + Long resLength = (Long) resource[1]; if (res != null) { plasmaParserDocument document = null; try { - document = sc.parseDocument(url, res); + // parse the document + document = sc.parseDocument(url, resLength.longValue(), res); } catch (ParserException e) { // parsing failed + } finally { + try { res.close(); } catch (Exception e) {/* ignore this */} } if (document == null) return; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index d2a1f6864..80a63a1a1 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -44,7 +44,9 @@ package de.anomic.plasma; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; @@ -187,46 +189,62 @@ public class plasmaSnippetCache { * LOADING RESOURCE DATA * =========================================================================== */ // if the snippet is not in the cache, we can try to get it from the htcache - byte[] resource = null; - IResourceInfo docInfo = null; + long resContentLength = 0; + InputStream resContent = null; + IResourceInfo resInfo = null; try { // trying to load the resource from the cache - resource = this.cacheManager.loadResourceContent(url); + resContent = this.cacheManager.getResourceContentStream(url); + if (resContent != null) { + // if the content was found + resContentLength = this.cacheManager.getResourceContentLength(url); + + // getting resource metadata + resInfo = this.cacheManager.loadResourceInfo(url); - // if not found try to download it - if ((resource == null) && (fetchOnline)) { - // download resource using the crawler - plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout); + } 
else if (fetchOnline) { + // if not found try to download it - // getting resource metadata (e.g. the http headers for http resources) - if (entry != null) docInfo = entry.getDocumentInfo(); + // download resource using the crawler and keep resource in memory if possible + plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true); - // read resource body (if it is there) - resource = entry.cacheArray(); + // getting resource metadata (e.g. the http headers for http resources) + if (entry != null) { + resInfo = entry.getDocumentInfo(); + + // read resource body (if it is there) + byte []resourceArray = entry.cacheArray(); + if (resourceArray != null) { + resContent = new ByteArrayInputStream(resourceArray); + resContentLength = resourceArray.length; + } else { + resContent = this.cacheManager.getResourceContentStream(url); + resContentLength = this.cacheManager.getResourceContentLength(url); + } + } - // in case that the reosurce was not in ram, read it from disk - if (resource == null) resource = this.cacheManager.loadResourceContent(url); + // if it is still not available, report an error + if (resContent == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); - // if it is still not available, throw exception - if (resource == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); - source = SOURCE_WEB; + } else { + return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); } } catch (Exception e) { if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage()); - } + } - if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); - /* =========================================================================== * PARSING RESOURCE * 
=========================================================================== */ plasmaParserDocument document = null; try { - document = parseDocument(url, resource, docInfo); + document = parseDocument(url, resContentLength, resContent, resInfo); } catch (ParserException e) { return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed + } finally { + try { resContent.close(); } catch (Exception e) {/* ignore this */} } if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed @@ -263,30 +281,40 @@ public class plasmaSnippetCache { * @return the parsed document as {@link plasmaParserDocument} */ public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) { - byte[] resource = null; IResourceInfo docInfo = null; try { // trying to load the resource body from cache - resource = this.cacheManager.loadResourceContent(url); + InputStream content = this.cacheManager.getResourceContentStream(url); + long resourceLength = this.cacheManager.getResourceContentLength(url); // if not available try to load resource from web - if ((fetchOnline) && (resource == null)) { + if ((fetchOnline) && (content == null)) { // download resource using crawler - plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); + plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000, true); // fetching metadata of the resource (e.g. 
http headers for http resource) - if (entry != null) docInfo = entry.getDocumentInfo(); - - // getting the resource body from the cache - resource = this.cacheManager.loadResourceContent(url); + if (entry != null) { + docInfo = entry.getDocumentInfo(); + + byte[] resourceArray = entry.cacheArray(); + if (resourceArray != null) { + // read resource body (if it is there) + content = new ByteArrayInputStream(resourceArray); + resourceLength = resourceArray.length; + } else { + // in case that the reosurce was not in ram, read it from disk + content = this.cacheManager.getResourceContentStream(url); + resourceLength = this.cacheManager.getResourceContentLength(url); + } + } } else { // trying to load resource metadata docInfo = this.cacheManager.loadResourceInfo(url); } // parsing document - if (resource == null) return null; - return parseDocument(url, resource, docInfo); + if (content == null) return null; + return parseDocument(url, resourceLength, content, docInfo); } catch (ParserException e) { this.log.logWarning("Unable to parse resource. 
" + e.getMessage()); return null; @@ -446,15 +474,24 @@ public class plasmaSnippetCache { return map; } - public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException { - return parseDocument(url, resource, null); + public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException { + return parseDocument(url, contentLength, resourceStream, null); } - public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException { + /** + * Parse the resource + * @param url the URL of the resource + * @param contentLength the contentLength of the resource + * @param resourceStream the resource body as stream + * @param docInfo metadata about the resource + * @return the extracted data + * @throws ParserException + */ + public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream, IResourceInfo docInfo) throws ParserException { try { - if (resource == null) return null; + if (resourceStream == null) return null; - // if no resource metadata is available, try to load it + // STEP 1: if no resource metadata is available, try to load it from cache if (docInfo == null) { // try to get the header from the htcache directory try { @@ -464,18 +501,21 @@ public class plasmaSnippetCache { } } + // STEP 2: if the metadata is still null try to download it from web + if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { // TODO: we need a better solution here - // encapsulate this in the crawlLoader class - if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { - // getting URL mimeType - try { - httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); - docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header); - } catch (Exception e) { - // ingore this. http header download failed - } - } + // e.g. 
encapsulate this in the crawlLoader class + + // getting URL mimeType + try { + httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); + docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header); + } catch (Exception e) { + // ingore this. http header download failed + } + } + // STEP 3: if the metadata is still null try to guess the mimeType of the resource if (docInfo == null) { String filename = this.cacheManager.getCachePath(url).getName(); int p = filename.lastIndexOf('.'); @@ -495,12 +535,12 @@ public class plasmaSnippetCache { supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1)); } - return this.parser.parseSource(url, supposedMime, null, resource); + return this.parser.parseSource(url, supposedMime, null, contentLength, resourceStream); } return null; - } + } if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) { - return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), resource); + return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), contentLength, resourceStream); } return null; } catch (InterruptedException e) { @@ -509,27 +549,57 @@ public class plasmaSnippetCache { } } - public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) { + /** + * + * @param url + * @param fetchOnline + * @param socketTimeout + * @return an Object array containing + * + * + * + *
[0]the content as {@link InputStream}
[1]the content-length as {@link Long}
+ */ + public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout) { // load the url as resource from the web try { - // trying to load the resource body from cache - byte[] resource = cacheManager.loadResourceContent(url); + long contentLength = -1; - // if the content is not available in cache try to download it from web - if ((fetchOnline) && (resource == null)) { + // trying to load the resource body from cache + InputStream resource = this.cacheManager.getResourceContentStream(url); + if (resource != null) { + contentLength = this.cacheManager.getResourceContentLength(url); + } else if (fetchOnline) { + // if the content is not available in cache try to download it from web + // try to download the resource using a crawler - loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout); + plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true); - // get the content from cache - resource = cacheManager.loadResourceContent(url); + // read resource body (if it is there) + byte[] resourceArray = entry.cacheArray(); + + // in case that the reosurce was not in ram, read it from disk + if (resourceArray == null) { + resource = this.cacheManager.getResourceContentStream(url); + contentLength = this.cacheManager.getResourceContentLength(url); + } else { + resource = new ByteArrayInputStream(resourceArray); + contentLength = resourceArray.length; + } + } else { + return null; } - return resource; + return new Object[]{resource,new Long(contentLength)}; } catch (IOException e) { return null; } } - public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException { + public plasmaHTCache.Entry loadResourceFromWeb( + URL url, + int socketTimeout, + boolean keepInMemory + ) throws plasmaCrawlerException { plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync( url, @@ -538,7 +608,8 @@ public class plasmaSnippetCache { null, 0, null, - socketTimeout + socketTimeout, + 
keepInMemory ); return result; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 939e2e21f..0adc1cabe 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -105,6 +105,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.lang.reflect.Constructor; import java.net.InetAddress; import java.net.MalformedURLException; @@ -2181,17 +2182,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL url = entry.url(); if (url == null) return 0; + InputStream resourceContent = null; try { - // get set of words - // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes()); + // get the resource content + Object[] resource = snippetCache.getResource(url, fetchOnline, 10000); + resourceContent = (InputStream) resource[0]; + Long resourceContentLength = (Long) resource[1]; + + // parse the resource + plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent); + + // getting parsed body input stream + InputStream docBodyInputStream = document.getText(); + + // getting word iterator + Iterator witer = plasmaCondenser.getWords(docBodyInputStream); + // delete all word references int count = removeReferences(urlhash, witer); + // finally delete the url entry itself urlPool.loadedURL.remove(urlhash); return count; } catch (ParserException e) { return 0; + } finally { + if (resourceContent != null) try { resourceContent.close(); } catch (Exception e) {/* ignore this */} } } diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 363754e96..e514a09e4 100644 --- 
a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -144,7 +144,7 @@ public final class serverFileUtils { public static void writeX(InputStream source, String inputCharset, Writer procOS, OutputStream bufferOS, String outputCharset) throws IOException { InputStreamReader sourceReader = new InputStreamReader(source,inputCharset); - OutputStreamWriter bufferOSWriter = new OutputStreamWriter(bufferOS,outputCharset); + OutputStreamWriter bufferOSWriter = (bufferOS==null)?null:new OutputStreamWriter(bufferOS,outputCharset); writeX(sourceReader,procOS,bufferOSWriter); }