diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index b1f12ef9e..525c9d7e4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -45,6 +45,7 @@ //if the shell's current path is HTROOT import java.io.IOException; +import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; @@ -60,6 +61,7 @@ import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.plasmaCrawlLURL.Entry; +import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -121,18 +123,20 @@ public class ViewFile { } // loading the resource content as byte array - byte[] resource = null; + InputStream resource = null; + long resourceLength = -1; IResourceInfo resInfo = null; String resMime = null; try { // trying to load the resource body - resource = sb.cacheManager.loadResourceContent(url); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); // if the resource body was not cached we try to load it from web if (resource == null) { plasmaHTCache.Entry entry = null; try { - entry = sb.snippetCache.loadResourceFromWeb(url, 5000); + entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false); } catch (plasmaCrawlerException e) { prop.put("error",4); prop.put("error_errorText",e.getMessage()); @@ -142,11 +146,13 @@ public class ViewFile { if (entry != null) { resInfo = entry.getDocumentInfo(); - resource = sb.cacheManager.loadResourceContent(url); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); } if (resource == null) { prop.put("error",4); + prop.put("error_errorText","No resource available"); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } @@ -172,21 +178,46 @@ public class ViewFile { 
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); if (responseHeader == null) { prop.put("error",4); + prop.put("error_errorText","Unable to load resource metadata."); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } + try { + resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader); + } catch (Exception e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } resMime = responseHeader.mime(); } } else { resMime = resInfo.getMimeType(); } } catch (IOException e) { + if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */} prop.put("error",4); + prop.put("error_errorText",e.getMessage()); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; - } - if (viewMode.equals("plain")) { - String content = new String(resource); + } + + if (viewMode.equals("plain")) { + + // TODO: how to handle very large files here ? 
+ String content; + try { + content = new String(serverFileUtils.read(resource),"UTF-8"); + } catch (Exception e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } finally { + if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} + } + content = content.replaceAll("<","&lt;") .replaceAll(">","&gt;") .replaceAll("\"","&quot;") @@ -195,12 +226,15 @@ public class ViewFile { prop.put("error",0); prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); - prop.put("viewMode_plainText",content); + prop.put("viewMode_plainText",content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode",VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url",url.toString()); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { // parsing the resource content plasmaParserDocument document = null; try { - document = sb.snippetCache.parseDocument(url, resource,resInfo); + document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo); if (document == null) { prop.put("error",5); prop.put("error_errorText","Unknown error"); @@ -212,7 +246,10 @@ public class ViewFile { prop.put("error_errorText",e.getMessage()); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; + } finally { + if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} } + + resMime = document.getMimeType(); if (viewMode.equals("parsed")) { @@ -223,9 +260,6 @@ public class ViewFile { prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); prop.put("viewMode_parsedText",content); - } else if (viewMode.equals("iframe")) { - prop.put("viewMode",VIEW_MODE_AS_IFRAME); - prop.put("viewMode_url",url.toString()); - } else { prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); String[] sentences = document.getSentences(); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 1d1329873..30d765ee2 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -43,11 +43,14 @@ import java.awt.Container; import java.awt.Image; import java.awt.MediaTracker; import java.awt.Toolkit; +import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import de.anomic.http.httpHeader; import de.anomic.net.URL; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -70,9 +73,20 @@ public class ViewImage { int maxheight = post.getInt("maxheight", 0); int timeout = post.getInt("timeout", 5000); - // load image - byte[] imgb = sb.snippetCache.getResource(url, true, timeout); - if (imgb == null) return null; + // getting the image as stream + InputStream imgStream = (InputStream) sb.snippetCache.getResource(url, true, timeout)[0]; + if (imgStream == null) return null; + + // read image data + byte[] imgb = null; + try { + imgb = serverFileUtils.read(imgStream); + } catch (IOException e) { + return null; + } finally { + try { imgStream.close(); } catch (Exception e) {/* ignore this */} + } + // create image MediaTracker mediaTracker = new MediaTracker(new Container()); diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index ee910c023..0a5656058 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -1828,7 +1828,7 @@ do upload // return sbb.getBytes(); return serverFileUtils.read(this.getContentInputStream()); } - + /** * This method outputs the found content into an byte-array and * additionally outputs it to procOS. 
@@ -1837,9 +1837,13 @@ do upload * @return * @throws IOException */ - public byte[] writeContent(Object procOS) throws IOException { - int contentLength = (int) this.responseHeader.contentLength(); - serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); + public byte[] writeContent(Object procOS, boolean returnByteArray) throws IOException { + serverByteBuffer sbb = null; + + if (returnByteArray) { + int contentLength = (int) this.responseHeader.contentLength(); + sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); + } if (procOS instanceof OutputStream) { //writeContentX(httpc.this.clientInput, this.gzip, this.responseHeader.contentLength(), procOS, sbb); @@ -1852,7 +1856,7 @@ do upload throw new IllegalArgumentException("Invalid procOS object type '" + procOS.getClass().getName() + "'"); } - return sbb.getBytes(); + return (sbb==null)?null:sbb.getBytes(); } /** diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index ea7edfbcb..441bdef2c 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -662,7 +662,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if ((contentLength > 0) && (contentLength < 1048576)) // if the length is known and < 1 MB { // ok, we don't write actually into a file, only to RAM, and schedule writing the file. - byte[] cacheArray = res.writeContent(hfos); + byte[] cacheArray = res.writeContent(hfos,true); this.theLogger.logFine("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? 
"null" : ("size=" + cacheArray.length))); if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).finalize(); diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 2df4f4d4b..6960ea857 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -80,6 +80,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW */ protected boolean done = false; + /* ============================================================ * Crawl job specific variables * ============================================================ */ @@ -92,6 +93,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW protected long startdate; protected plasmaCrawlProfile.entry profile; protected boolean acceptAllContent; + protected boolean keepInMemory; protected String errorMessage; @@ -159,22 +161,27 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW try { // The thread keeps running. - while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) { - if (this.done) { - synchronized (this) { - // return thread back into pool - this.myPool.returnObject(this.protocol,this); - - // We are waiting for a new task now. - if (!this.stopped && !this.destroyed && !this.isInterrupted()) { - this.wait(); + while (!this.stopped && !this.isInterrupted()) { + if (this.done) { + if (this.myPool != null && !this.myPool.isClosed) { + synchronized (this) { + // return thread back into pool + this.myPool.returnObject(this.protocol,this); + + // We are waiting for a new task now. 
+ if (!this.stopped && !this.destroyed && !this.isInterrupted()) { + this.wait(); + } } + } else { + this.stopped = true; } } else { try { // executing the new task execute(); } finally { + // free memory reset(); } } @@ -231,6 +238,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.depth = theNewMsg.depth; this.profile = theNewMsg.profile; this.acceptAllContent = theNewMsg.acceptAllContent; + this.keepInMemory = theNewMsg.keepInMemory; this.startdate = System.currentTimeMillis(); @@ -260,6 +268,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW public void reset() { this.theMsg = null; + this.url = null; this.name = null; this.refererURLString = null; @@ -268,6 +277,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.startdate = 0; this.profile = null; this.acceptAllContent = false; + this.keepInMemory = false; + this.errorMessage = null; } diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 54c1a8a60..ebb064048 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -262,8 +262,9 @@ public final class CrawlWorker extends AbstractCrawlWorker { } // we write the new cache entry to file system directly - res.writeContent(fos); - htCache.setCacheArray(null); + byte[] cacheArray = null; + cacheArray = res.writeContent(fos,this.keepInMemory); + htCache.setCacheArray(cacheArray); this.cacheManager.writeFileAnnouncement(cacheFile); } finally { if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */} diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java index 7f4f229ee..6f974957d 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java @@ -84,10 
+84,14 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { this.thePool = pool; } + public Object makeObject(Object key) throws Exception { + return makeObject(key, true); + } + /** * @see org.apache.commons.pool.PoolableObjectFactory#makeObject() */ - public Object makeObject(Object key) throws Exception { + public Object makeObject(Object key, boolean usePool) throws Exception { if (!(key instanceof String)) throw new IllegalArgumentException("The object key must be of type string."); @@ -109,11 +113,11 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { // instantiating class plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] { this.theThreadGroup, - this.thePool, + (usePool)?this.thePool:null, this.sb, this.cacheManager, this.theLog - }); + }); // return the newly created object return theCrawlWorker; diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java index f69845901..7b52106ee 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java @@ -52,15 +52,22 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool; import de.anomic.server.logging.serverLog; public final class plasmaCrawlerPool extends GenericKeyedObjectPool { + + private plasmaCrawlerFactory theFactory; private final ThreadGroup theThreadGroup; public boolean isClosed = false; public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, GenericKeyedObjectPool.Config config, ThreadGroup threadGroup) { super(objFactory, config); + this.theFactory = objFactory; this.theThreadGroup = threadGroup; objFactory.setPool(this); } + public plasmaCrawlerFactory getFactory() { + return this.theFactory; + } + public Object borrowObject(Object key) throws Exception { return super.borrowObject(key); } diff --git a/source/de/anomic/plasma/parser/AbstractParser.java 
b/source/de/anomic/plasma/parser/AbstractParser.java index 2c7f1d701..baa413a06 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -90,7 +90,7 @@ public abstract class AbstractParser implements Parser{ * The source file file size in bytes if the source document was passed * in as file */ - protected long fileSize = -1; + protected long contentLength = -1; /** * The Constructor of this class. @@ -99,6 +99,15 @@ public abstract class AbstractParser implements Parser{ super(); this.libxDependencies = libxDependencies; } + + /** + * Set the content length of the source file. + * This value is needed by some parsers to decide + * if the parsed text could be hold in memory + */ + public void setContentLength(long length) { + this.contentLength = length; + } /** * Check if the parser was interrupted. @@ -185,7 +194,7 @@ public abstract class AbstractParser implements Parser{ BufferedInputStream contentInputStream = null; try { // getting the file size of the document - this.fileSize = sourceFile.length(); + this.contentLength = sourceFile.length(); // create a stream from the file contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); @@ -242,4 +251,8 @@ public abstract class AbstractParser implements Parser{ public String getName() { return this.parserName; } + + public void reset() { + this.contentLength = -1; + } } diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 83d0daa5c..a1adeae06 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -117,6 +117,8 @@ public interface Parser { */ public void reset(); + public void setContentLength(long length); + /** * @return Returns a list of library names that are needed by this parser */ diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index 
60621e7f8..53b2630dd 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -138,7 +138,7 @@ public class bzipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index b5a076399..92c116b4c 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -118,6 +118,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index 0c2af76b3..a289eb361 100644 --- a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -122,7 +122,7 @@ public class gzipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 38665c6c5..70b01f471 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -187,8 +187,7 @@ implements Parser { } } - public plasmaParserDocument parse(URL location, String mimeType,String charset, - InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException, InterruptedException { File dstFile = null; try { dstFile = File.createTempFile("mimeTypeParser",".tmp"); @@ -208,6 +207,7 @@ implements Parser { public void reset() { 
// Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 9d8e9e011..6fc977644 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -197,8 +197,8 @@ public class odtParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 5f2fca420..174d8fbd9 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -132,7 +132,7 @@ public class pdfParser extends AbstractParser implements Parser { } // creating a writer for output - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { writerFile = File.createTempFile("pdfParser",".tmp"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { @@ -199,7 +199,8 @@ public class pdfParser extends AbstractParser implements Parser { } public void reset() { - this.fileSize = -1; + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 6c52cb97c..90ee23222 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -169,8 +169,8 @@ public class rpmParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git 
a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 41cf8573b..dbf3d11ee 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -211,8 +211,8 @@ public class rssParser extends AbstractParser implements Parser { } public void reset() { - // TODO Auto-generated method stub - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index c054f079e..de5e3ff72 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -124,6 +124,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 4d3ff6860..4f066232a 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -104,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser { File outputFile = null; plasmaParserDocument subDoc = null; try { - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = File.createTempFile("zipParser",".tmp"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -251,7 +251,7 @@ public class tarParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index e31010537..f553d5032 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ 
b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -262,8 +262,8 @@ public class vcfParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 8a523dbcf..e672df7dd 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -102,7 +102,7 @@ public class zipParser extends AbstractParser implements Parser { File outputFile = null; plasmaParserDocument subDoc = null; try { - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = File.createTempFile("zipParser",".tmp"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -235,7 +235,7 @@ public class zipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 6ca5bfc63..d72eb43f8 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -671,11 +671,16 @@ public final class plasmaCondenser { } */ + public static Iterator getWords(InputStream input) { + if (input == null) return null; + plasmaCondenser condenser = new plasmaCondenser(input); + return condenser.words(); + } + public static Iterator getWords(byte[] text) { if (text == null) return null; ByteArrayInputStream buffer = new ByteArrayInputStream(text); - plasmaCondenser condenser = new plasmaCondenser(buffer); - return condenser.words(); + return getWords(buffer); } public static void 
main(String[] args) { diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index e349da8bf..eac42b16e 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -163,15 +163,23 @@ public final class plasmaCrawlLoader extends Thread { return this.theThreadGroup; } - private void execute(plasmaCrawlLoaderMessage theMsg) throws Exception { + private void execute(plasmaCrawlLoaderMessage theMsg, boolean useThreadPool) throws Exception { // getting the protocol of the next URL String protocol = theMsg.url.getProtocol(); // TODO: remove this if (protocol.equals("https")) protocol = "http"; - // getting a new crawler from the crawler pool - plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); + // get a new worker thread + plasmaCrawlWorker theWorker = null; + if (useThreadPool) { + // getting a new crawler from the crawler pool + theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); + } else { + // create a new one + theWorker = (plasmaCrawlWorker) this.crawlwerPool.getFactory().makeObject(protocol,false); + } + if (theWorker == null) { this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url); } else { @@ -187,7 +195,7 @@ public final class plasmaCrawlLoader extends Thread { plasmaCrawlLoaderMessage theMsg = this.theQueue.waitForMessage(); // start new crawl job - this.execute(theMsg); + this.execute(theMsg, true); } catch (InterruptedException e) { Thread.interrupted(); @@ -218,7 +226,8 @@ public final class plasmaCrawlLoader extends Thread { String initiator, int depth, plasmaCrawlProfile.entry profile, - int timeout + int timeout, + boolean keepInMemory ) throws plasmaCrawlerException { plasmaHTCache.Entry result = null; @@ -235,13 +244,14 @@ public final class plasmaCrawlLoader extends Thread { profile, crawlingPriority, true, - timeout + timeout, + keepInMemory ); try { 
// start new crawl job - this.execute(theMsg); + this.execute(theMsg, false); // wait for the crawl job result result = theMsg.waitForResult(); @@ -283,7 +293,8 @@ public final class plasmaCrawlLoader extends Thread { profile, // crawling profile crawlingPriority, // crawling priority false, // only download documents whose mimetypes are enabled for the crawler - -1 // use default crawler timeout + -1, // use default crawler timeout + false // resource should not be kept in memory ); // adding the message to the queue diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java index b3d678c67..60929d606 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java +++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java @@ -56,6 +56,7 @@ public final class plasmaCrawlLoaderMessage { public final plasmaCrawlProfile.entry profile; public final boolean acceptAllContent; public final int timeout; + public final boolean keepInMemory; private serverSemaphore resultSync = null; private plasmaHTCache.Entry result; @@ -71,7 +72,8 @@ public final class plasmaCrawlLoaderMessage { plasmaCrawlProfile.entry profile, int crawlingPriority, boolean acceptAllContent, - int timeout + int timeout, + boolean keepInMemory ) { this.url = url; this.name = name; @@ -82,6 +84,7 @@ public final class plasmaCrawlLoaderMessage { this.crawlingPriority = crawlingPriority; this.acceptAllContent = acceptAllContent; this.timeout = timeout; + this.keepInMemory = keepInMemory; this.resultSync = new serverSemaphore(0); this.result = null; diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 293acdc28..17d34d372 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -53,9 +53,12 @@ package de.anomic.plasma; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import 
java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.Date; @@ -701,16 +704,51 @@ public final class plasmaHTCache { return null; } + /** + * @param url + * @return + * + * @deprecated dont't use this function to avoid OutOfMemory-Exceptions. + * Use {@link #getResourceContentStream(URL)} instead + */ public byte[] loadResourceContent(URL url) { // load the url as resource from the cache File f = getCachePath(url); - if (f.exists()) try { + if (f.exists() && f.canRead()) try { return serverFileUtils.read(f); } catch (IOException e) { return null; } return null; } + + /** + * Returns the content of a cached resource as {@link InputStream} + * @param url the requested resource + * @return the resource content as {@link InputStream}. In no data + * is available or the cached file is not readable, null + * is returned. + */ + public InputStream getResourceContentStream(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists() && f.canRead()) try { + return new BufferedInputStream(new FileInputStream(f)); + } catch (IOException e) { + this.log.logSevere("Unable to create a BufferedInputStream from file " + f,e); + return null; + } + return null; + } + + public long getResourceContentLength(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists() && f.canRead()) { + return f.length(); + } + return 0; + } public static boolean isPOST(String urlString) { return (urlString.indexOf("?") >= 0 || diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index e339bc0fe..b420b7ffc 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -45,11 +45,13 @@ package de.anomic.plasma; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; import java.io.File; import 
java.io.FileFilter; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; @@ -465,16 +467,25 @@ public final class plasmaParser { } catch (Exception e) {/* ignore this */} } - public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) + public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] sourceArray) throws InterruptedException, ParserException { - File tempFile = null; + ByteArrayInputStream byteIn = null; try { - // creating a temp file to store the byte array - tempFile = File.createTempFile("parseSource", ".tmp"); - serverFileUtils.write(source, tempFile); + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from byte-array"); + + // testing if the resource is not empty + if (sourceArray == null || sourceArray.length == 0) { + String errorMsg = "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + + // creating an InputStream + byteIn = new ByteArrayInputStream(sourceArray); // parsing the temp file - return parseSource(location, mimeType, charset, tempFile); + return parseSource(location, mimeType, charset, sourceArray.length, byteIn); } catch (Exception e) { // Interrupted- and Parser-Exceptions should pass through @@ -482,20 +493,65 @@ public final class plasmaParser { if (e instanceof ParserException) throw (ParserException) e; // log unexpected error - this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e); + this.theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); throw new ParserException("Unexpected exception while parsing " + location,location, e); } finally { - if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */} + if (byteIn != null) try { byteIn.close(); } catch (Exception ex){/* ignore this */} } } - public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) - throws InterruptedException, ParserException { + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) throws InterruptedException, ParserException { + + BufferedInputStream sourceStream = null; + try { + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from file"); + + // testing if the resource is not empty + if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { + String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + + // create a new InputStream + sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); + + // parsing the data + return this.parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream); + + } catch (Exception e) { + // Interrupted- and Parser-Exceptions should pass through + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + // log unexpected error + this.theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); + throw new ParserException("Unexpected exception while parsing " + location,location, e); + } finally { + if (sourceStream != null) try { sourceStream.close(); } catch (Exception ex){/* ignore this */} + } + } + + /** + * To parse a resource from an {@link InputStream} + * @param location the URL of the resource + * @param theMimeType the resource mimetype (null if unknown) + * @param theDocumentCharset the charset of the resource (null if unknown) + * @param contentLength the content length of the resource (-1 if unknown) + * @param sourceStream an {@link InputStream} containing the resource body + * @return the parsed {@link plasmaParserDocument document} + * @throws InterruptedException + * @throws ParserException + */ + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, long contentLength, InputStream sourceStream) throws InterruptedException, ParserException { Parser theParser = null; String mimeType = null; try { + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from stream"); + // getting the mimetype of the document mimeType = getRealMimeType(theMimeType); @@ -513,66 +569,9 @@ public final class plasmaParser { throw new 
ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); } - // testing if the resource is not empty - if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; - this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); - } - - if (this.theLogger.isFine()) this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + - "' and file extension '" + fileExt + "'."); - - /* - * There are some problematic mimeType - fileExtension combination where we have to enforce - * a mimeType detection to get the proper parser for the content - * - * - application/zip + .odt - * - text/plain + .odt - * - text/plain + .vcf - * - text/xml + .rss - * - text/xml + .atom - * - * In all these cases we can trust the fileExtension and have to determine the proper mimeType. 
- * - */ - -// // Handling of not trustable mimeTypes -// // - text/plain -// // - text/xml -// // - application/octet-stream -// // - application/zip -// if ( -// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) || -// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt")) -// ) { -// if (this.theLogger.isFine()) -// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType + -// "' that seems not to be correct for file extension '" + fileExt + "'."); -// -// if (enabledParserList.containsKey("application/octet-stream")) { -// theParser = this.getParser("application/octet-stream"); -// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile); -// if (newMime == null) -// if (newMime instanceof String) { -// String newMimeType = (String)newMime; -// if ((newMimeType.equals("application/octet-stream")) { -// return null; -// } -// mimeType = newMimeType; -// } -// } else { -// return null; -// } -// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){ -// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) { -// mimeType = "application/vnd.oasis.opendocument.text"; -// } else { -// return null; -// } -// } + "' and file extension '" + fileExt + "'."); // getting the correct parser for the given mimeType theParser = this.getParser(mimeType); @@ -580,9 +579,12 @@ public final class plasmaParser { // if a parser was found we use it ... 
plasmaParserDocument doc = null; if (theParser != null) { - doc = theParser.parse(location, mimeType,documentCharset,sourceFile); + // set the content length of the resource + theParser.setContentLength(contentLength); + // parse the resource + doc = theParser.parse(location, mimeType,documentCharset,sourceStream); } else if (realtimeParsableMimeTypesContains(mimeType)) { - doc = parseHtml(location, mimeType, documentCharset, sourceFile); + doc = parseHtml(location, mimeType, documentCharset, sourceStream); } else { String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); @@ -611,14 +613,13 @@ public final class plasmaParser { if (theParser != null) { try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */} } - } + } } - private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException { + private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, InputStream sourceStream) throws IOException, ParserException { // ...otherwise we make a scraper and transformer - FileInputStream fileIn = new FileInputStream(sourceFile); - htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false); + htmlFilterInputStream htmlFilter = new htmlFilterInputStream(sourceStream,documentCharset,location,null,false); String charset = htmlFilter.detectCharset(); if (charset == null) { charset = documentCharset; @@ -763,7 +764,7 @@ public final class plasmaParser { //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out try { - File contentFile = null; + Object content = null; URL contentURL = 
null; String contentMimeType = "application/octet-stream"; String charSet = "UTF-8"; @@ -774,17 +775,13 @@ public final class plasmaParser { String mode = args[0]; if (mode.equalsIgnoreCase("-f")) { - contentFile = new File(args[1]); - contentURL = new URL(contentFile); + content = new File(args[1]); + contentURL = new URL((File)content); } else if (mode.equalsIgnoreCase("-u")) { contentURL = new URL(args[1]); // downloading the document content - byte[] contentBytes = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); - - contentFile = File.createTempFile("content",".tmp"); - contentFile.deleteOnExit(); - serverFileUtils.write(contentBytes, contentFile); + content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); } if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) { @@ -805,7 +802,12 @@ public final class plasmaParser { plasmaParser.enableAllParsers(PARSER_MODE_PROXY); // parsing the content - plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile); + plasmaParserDocument document = null; + if (content instanceof byte[]) { + document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content); + } else if (content instanceof File) { + document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content); + } // printing out all parsed sentences if (document != null) { diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 129302433..d6ea1bd9d 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -41,6 +41,7 @@ package de.anomic.plasma; +import java.io.InputStream; import java.net.MalformedURLException; import java.util.Iterator; import java.util.Map; @@ -59,13 +60,18 @@ public final class plasmaSearchImages { long start = System.currentTimeMillis(); this.images = new TreeSet(); if (maxTime > 10) { 
- byte[] res = sc.getResource(url, true, (int) maxTime); + Object[] resource = sc.getResource(url, true, (int) maxTime); + InputStream res = (InputStream) resource[0]; + Long resLength = (Long) resource[1]; if (res != null) { plasmaParserDocument document = null; try { - document = sc.parseDocument(url, res); + // parse the document + document = sc.parseDocument(url, resLength.longValue(), res); } catch (ParserException e) { // parsing failed + } finally { + try { res.close(); } catch (Exception e) {/* ignore this */} } if (document == null) return; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index d2a1f6864..80a63a1a1 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -44,7 +44,9 @@ package de.anomic.plasma; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; @@ -187,46 +189,62 @@ public class plasmaSnippetCache { * LOADING RESOURCE DATA * =========================================================================== */ // if the snippet is not in the cache, we can try to get it from the htcache - byte[] resource = null; - IResourceInfo docInfo = null; + long resContentLength = 0; + InputStream resContent = null; + IResourceInfo resInfo = null; try { // trying to load the resource from the cache - resource = this.cacheManager.loadResourceContent(url); + resContent = this.cacheManager.getResourceContentStream(url); + if (resContent != null) { + // if the content was found + resContentLength = this.cacheManager.getResourceContentLength(url); + + // getting resource metadata + resInfo = this.cacheManager.loadResourceInfo(url); - // if not found try to download it - if ((resource == null) && (fetchOnline)) { - // download resource using the crawler - plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout); + } 
else if (fetchOnline) { + // if not found try to download it - // getting resource metadata (e.g. the http headers for http resources) - if (entry != null) docInfo = entry.getDocumentInfo(); + // download resource using the crawler and keep resource in memory if possible + plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true); - // read resource body (if it is there) - resource = entry.cacheArray(); + // getting resource metadata (e.g. the http headers for http resources) + if (entry != null) { + resInfo = entry.getDocumentInfo(); + + // read resource body (if it is there) + byte []resourceArray = entry.cacheArray(); + if (resourceArray != null) { + resContent = new ByteArrayInputStream(resourceArray); + resContentLength = resourceArray.length; + } else { + resContent = this.cacheManager.getResourceContentStream(url); + resContentLength = this.cacheManager.getResourceContentLength(url); + } + } - // in case that the reosurce was not in ram, read it from disk - if (resource == null) resource = this.cacheManager.loadResourceContent(url); + // if it is still not available, report an error + if (resContent == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); - // if it is still not available, throw exception - if (resource == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); - source = SOURCE_WEB; + } else { + return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); } } catch (Exception e) { if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage()); - } + } - if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); - /* =========================================================================== * PARSING RESOURCE * 
=========================================================================== */ plasmaParserDocument document = null; try { - document = parseDocument(url, resource, docInfo); + document = parseDocument(url, resContentLength, resContent, resInfo); } catch (ParserException e) { return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed + } finally { + try { resContent.close(); } catch (Exception e) {/* ignore this */} } if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed @@ -263,30 +281,40 @@ public class plasmaSnippetCache { * @return the parsed document as {@link plasmaParserDocument} */ public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) { - byte[] resource = null; IResourceInfo docInfo = null; try { // trying to load the resource body from cache - resource = this.cacheManager.loadResourceContent(url); + InputStream content = this.cacheManager.getResourceContentStream(url); + long resourceLength = this.cacheManager.getResourceContentLength(url); // if not available try to load resource from web - if ((fetchOnline) && (resource == null)) { + if ((fetchOnline) && (content == null)) { // download resource using crawler - plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); + plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000, true); // fetching metadata of the resource (e.g. 
http headers for http resource) - if (entry != null) docInfo = entry.getDocumentInfo(); - - // getting the resource body from the cache - resource = this.cacheManager.loadResourceContent(url); + if (entry != null) { + docInfo = entry.getDocumentInfo(); + + byte[] resourceArray = entry.cacheArray(); + if (resourceArray != null) { + // read resource body (if it is there) + content = new ByteArrayInputStream(resourceArray); + resourceLength = resourceArray.length; + } else { + // in case that the reosurce was not in ram, read it from disk + content = this.cacheManager.getResourceContentStream(url); + resourceLength = this.cacheManager.getResourceContentLength(url); + } + } } else { // trying to load resource metadata docInfo = this.cacheManager.loadResourceInfo(url); } // parsing document - if (resource == null) return null; - return parseDocument(url, resource, docInfo); + if (content == null) return null; + return parseDocument(url, resourceLength, content, docInfo); } catch (ParserException e) { this.log.logWarning("Unable to parse resource. 
" + e.getMessage()); return null; @@ -446,15 +474,24 @@ public class plasmaSnippetCache { return map; } - public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException { - return parseDocument(url, resource, null); + public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException { + return parseDocument(url, contentLength, resourceStream, null); } - public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException { + /** + * Parse the resource + * @param url the URL of the resource + * @param contentLength the contentLength of the resource + * @param resourceStream the resource body as stream + * @param docInfo metadata about the resource + * @return the extracted data + * @throws ParserException + */ + public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream, IResourceInfo docInfo) throws ParserException { try { - if (resource == null) return null; + if (resourceStream == null) return null; - // if no resource metadata is available, try to load it + // STEP 1: if no resource metadata is available, try to load it from cache if (docInfo == null) { // try to get the header from the htcache directory try { @@ -464,18 +501,21 @@ public class plasmaSnippetCache { } } + // STEP 2: if the metadata is still null try to download it from web + if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { // TODO: we need a better solution here - // encapsulate this in the crawlLoader class - if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { - // getting URL mimeType - try { - httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); - docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header); - } catch (Exception e) { - // ingore this. http header download failed - } - } + // e.g. 
encapsulate this in the crawlLoader class + + // getting URL mimeType + try { + httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); + docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header); + } catch (Exception e) { + // ingore this. http header download failed + } + } + // STEP 3: if the metadata is still null try to guess the mimeType of the resource if (docInfo == null) { String filename = this.cacheManager.getCachePath(url).getName(); int p = filename.lastIndexOf('.'); @@ -495,12 +535,12 @@ public class plasmaSnippetCache { supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1)); } - return this.parser.parseSource(url, supposedMime, null, resource); + return this.parser.parseSource(url, supposedMime, null, contentLength, resourceStream); } return null; - } + } if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) { - return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), resource); + return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), contentLength, resourceStream); } return null; } catch (InterruptedException e) { @@ -509,27 +549,57 @@ public class plasmaSnippetCache { } } - public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) { + /** + * + * @param url + * @param fetchOnline + * @param socketTimeout + * @return an Object array containing + * + * + * + *
[0]the content as {@link InputStream}
[1]the content-length as {@link Integer}
+ */ + public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout) { // load the url as resource from the web try { - // trying to load the resource body from cache - byte[] resource = cacheManager.loadResourceContent(url); + long contentLength = -1; - // if the content is not available in cache try to download it from web - if ((fetchOnline) && (resource == null)) { + // trying to load the resource body from cache + InputStream resource = this.cacheManager.getResourceContentStream(url); + if (resource != null) { + contentLength = this.cacheManager.getResourceContentLength(url); + } else if (fetchOnline) { + // if the content is not available in cache try to download it from web + // try to download the resource using a crawler - loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout); + plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true); - // get the content from cache - resource = cacheManager.loadResourceContent(url); + // read resource body (if it is there) + byte[] resourceArray = entry.cacheArray(); + + // in case that the reosurce was not in ram, read it from disk + if (resourceArray == null) { + resource = this.cacheManager.getResourceContentStream(url); + contentLength = this.cacheManager.getResourceContentLength(url); + } else { + resource = new ByteArrayInputStream(resourceArray); + contentLength = resourceArray.length; + } + } else { + return null; } - return resource; + return new Object[]{resource,new Long(contentLength)}; } catch (IOException e) { return null; } } - public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException { + public plasmaHTCache.Entry loadResourceFromWeb( + URL url, + int socketTimeout, + boolean keepInMemory + ) throws plasmaCrawlerException { plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync( url, @@ -538,7 +608,8 @@ public class plasmaSnippetCache { null, 0, null, - socketTimeout + socketTimeout, + 
keepInMemory ); return result; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 939e2e21f..0adc1cabe 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -105,6 +105,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.lang.reflect.Constructor; import java.net.InetAddress; import java.net.MalformedURLException; @@ -2181,17 +2182,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL url = entry.url(); if (url == null) return 0; + InputStream resourceContent = null; try { - // get set of words - // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes()); + // get the resource content + Object[] resource = snippetCache.getResource(url, fetchOnline, 10000); + resourceContent = (InputStream) resource[0]; + Long resourceContentLength = (Long) resource[1]; + + // parse the resource + plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent); + + // getting parsed body input stream + InputStream docBodyInputStream = document.getText(); + + // getting word iterator + Iterator witer = plasmaCondenser.getWords(docBodyInputStream); + // delete all word references int count = removeReferences(urlhash, witer); + // finally delete the url entry itself urlPool.loadedURL.remove(urlhash); return count; } catch (ParserException e) { return 0; + } finally { + if (resourceContent != null) try { resourceContent.close(); } catch (Exception e) {/* ignore this */} } } diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 363754e96..e514a09e4 100644 --- 
a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -144,7 +144,7 @@ public final class serverFileUtils { public static void writeX(InputStream source, String inputCharset, Writer procOS, OutputStream bufferOS, String outputCharset) throws IOException { InputStreamReader sourceReader = new InputStreamReader(source,inputCharset); - OutputStreamWriter bufferOSWriter = new OutputStreamWriter(bufferOS,outputCharset); + OutputStreamWriter bufferOSWriter = (bufferOS==null)?null:new OutputStreamWriter(bufferOS,outputCharset); writeX(sourceReader,procOS,bufferOSWriter); }