diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index b1f12ef9e..525c9d7e4 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -45,6 +45,7 @@
//if the shell's current path is HTROOT
import java.io.IOException;
+import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
@@ -60,6 +61,7 @@ import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
+import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -121,18 +123,20 @@ public class ViewFile {
}
// loading the resource content as byte array
- byte[] resource = null;
+ InputStream resource = null;
+ long resourceLength = -1;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
- resource = sb.cacheManager.loadResourceContent(url);
+ resource = sb.cacheManager.getResourceContentStream(url);
+ resourceLength = sb.cacheManager.getResourceContentLength(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
- entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
+ entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
@@ -142,11 +146,13 @@ public class ViewFile {
if (entry != null) {
resInfo = entry.getDocumentInfo();
- resource = sb.cacheManager.loadResourceContent(url);
+ resource = sb.cacheManager.getResourceContentStream(url);
+ resourceLength = sb.cacheManager.getResourceContentLength(url);
}
if (resource == null) {
prop.put("error",4);
+ prop.put("error_errorText","No resource available");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
@@ -172,21 +178,46 @@ public class ViewFile {
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
+ prop.put("error_errorText","Unable to load resource metadata.");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
+ try {
+ resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
+ } catch (Exception e) {
+ prop.put("error",4);
+ prop.put("error_errorText",e.getMessage());
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
resMime = responseHeader.mime();
}
} else {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
+ if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
prop.put("error",4);
+ prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
- }
- if (viewMode.equals("plain")) {
- String content = new String(resource);
+ }
+
+ if (viewMode.equals("plain")) {
+
+            // TODO: decide how to handle very large files here (avoid reading them fully into memory)
+ String content;
+ try {
+ content = new String(serverFileUtils.read(resource),"UTF-8");
+ } catch (Exception e) {
+ prop.put("error",4);
+ prop.put("error_errorText",e.getMessage());
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ } finally {
+ if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
+ }
+
content = content.replaceAll("<","<")
.replaceAll(">",">")
.replaceAll("\"",""")
@@ -195,12 +226,15 @@ public class ViewFile {
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
- prop.put("viewMode_plainText",content);
- } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
+ prop.put("viewMode_plainText",content);
+ } else if (viewMode.equals("iframe")) {
+ prop.put("viewMode",VIEW_MODE_AS_IFRAME);
+ prop.put("viewMode_url",url.toString());
+ } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
- document = sb.snippetCache.parseDocument(url, resource,resInfo);
+ document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("error_errorText","Unknown error");
@@ -212,7 +246,10 @@ public class ViewFile {
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
+ } finally {
+ if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
}
+
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
@@ -223,9 +260,6 @@ public class ViewFile {
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
- } else if (viewMode.equals("iframe")) {
- prop.put("viewMode",VIEW_MODE_AS_IFRAME);
- prop.put("viewMode_url",url.toString());
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 1d1329873..30d765ee2 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -43,11 +43,14 @@ import java.awt.Container;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.Toolkit;
+import java.io.IOException;
+import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -70,9 +73,20 @@ public class ViewImage {
int maxheight = post.getInt("maxheight", 0);
int timeout = post.getInt("timeout", 5000);
- // load image
- byte[] imgb = sb.snippetCache.getResource(url, true, timeout);
- if (imgb == null) return null;
+ // getting the image as stream
+ InputStream imgStream = (InputStream) sb.snippetCache.getResource(url, true, timeout)[0];
+ if (imgStream == null) return null;
+
+ // read image data
+ byte[] imgb = null;
+ try {
+ imgb = serverFileUtils.read(imgStream);
+ } catch (IOException e) {
+ return null;
+ } finally {
+ try { imgStream.close(); } catch (Exception e) {/* ignore this */}
+ }
+
// create image
MediaTracker mediaTracker = new MediaTracker(new Container());
diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java
index ee910c023..0a5656058 100644
--- a/source/de/anomic/http/httpc.java
+++ b/source/de/anomic/http/httpc.java
@@ -1828,7 +1828,7 @@ do upload
// return sbb.getBytes();
return serverFileUtils.read(this.getContentInputStream());
}
-
+
/**
* This method outputs the found content into an byte-array and
* additionally outputs it to procOS.
@@ -1837,9 +1837,13 @@ do upload
* @return
* @throws IOException
*/
- public byte[] writeContent(Object procOS) throws IOException {
- int contentLength = (int) this.responseHeader.contentLength();
- serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
+ public byte[] writeContent(Object procOS, boolean returnByteArray) throws IOException {
+ serverByteBuffer sbb = null;
+
+ if (returnByteArray) {
+ int contentLength = (int) this.responseHeader.contentLength();
+ sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
+ }
if (procOS instanceof OutputStream) {
//writeContentX(httpc.this.clientInput, this.gzip, this.responseHeader.contentLength(), procOS, sbb);
@@ -1852,7 +1856,7 @@ do upload
throw new IllegalArgumentException("Invalid procOS object type '" + procOS.getClass().getName() + "'");
}
- return sbb.getBytes();
+ return (sbb==null)?null:sbb.getBytes();
}
/**
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index ea7edfbcb..441bdef2c 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -662,7 +662,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if ((contentLength > 0) && (contentLength < 1048576)) // if the length is known and < 1 MB
{
// ok, we don't write actually into a file, only to RAM, and schedule writing the file.
- byte[] cacheArray = res.writeContent(hfos);
+ byte[] cacheArray = res.writeContent(hfos,true);
this.theLogger.logFine("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));
if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).finalize();
diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
index 2df4f4d4b..6960ea857 100644
--- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
@@ -80,6 +80,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
*/
protected boolean done = false;
+
/* ============================================================
* Crawl job specific variables
* ============================================================ */
@@ -92,6 +93,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
protected long startdate;
protected plasmaCrawlProfile.entry profile;
protected boolean acceptAllContent;
+ protected boolean keepInMemory;
protected String errorMessage;
@@ -159,22 +161,27 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
try {
// The thread keeps running.
- while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) {
- if (this.done) {
- synchronized (this) {
- // return thread back into pool
- this.myPool.returnObject(this.protocol,this);
-
- // We are waiting for a new task now.
- if (!this.stopped && !this.destroyed && !this.isInterrupted()) {
- this.wait();
+ while (!this.stopped && !this.isInterrupted()) {
+ if (this.done) {
+ if (this.myPool != null && !this.myPool.isClosed) {
+ synchronized (this) {
+ // return thread back into pool
+ this.myPool.returnObject(this.protocol,this);
+
+ // We are waiting for a new task now.
+ if (!this.stopped && !this.destroyed && !this.isInterrupted()) {
+ this.wait();
+ }
}
+ } else {
+ this.stopped = true;
}
} else {
try {
// executing the new task
execute();
} finally {
+ // free memory
reset();
}
}
@@ -231,6 +238,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.depth = theNewMsg.depth;
this.profile = theNewMsg.profile;
this.acceptAllContent = theNewMsg.acceptAllContent;
+ this.keepInMemory = theNewMsg.keepInMemory;
this.startdate = System.currentTimeMillis();
@@ -260,6 +268,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
public void reset() {
this.theMsg = null;
+
this.url = null;
this.name = null;
this.refererURLString = null;
@@ -268,6 +277,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.startdate = 0;
this.profile = null;
this.acceptAllContent = false;
+ this.keepInMemory = false;
+
this.errorMessage = null;
}
diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
index 54c1a8a60..ebb064048 100644
--- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
@@ -262,8 +262,9 @@ public final class CrawlWorker extends AbstractCrawlWorker {
}
// we write the new cache entry to file system directly
- res.writeContent(fos);
- htCache.setCacheArray(null);
+ byte[] cacheArray = null;
+ cacheArray = res.writeContent(fos,this.keepInMemory);
+ htCache.setCacheArray(cacheArray);
this.cacheManager.writeFileAnnouncement(cacheFile);
} finally {
if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */}
diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java
index 7f4f229ee..6f974957d 100644
--- a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java
@@ -84,10 +84,14 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
this.thePool = pool;
}
+ public Object makeObject(Object key) throws Exception {
+ return makeObject(key, true);
+ }
+
/**
* @see org.apache.commons.pool.PoolableObjectFactory#makeObject()
*/
- public Object makeObject(Object key) throws Exception {
+ public Object makeObject(Object key, boolean usePool) throws Exception {
if (!(key instanceof String))
throw new IllegalArgumentException("The object key must be of type string.");
@@ -109,11 +113,11 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
// instantiating class
plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] {
this.theThreadGroup,
- this.thePool,
+ (usePool)?this.thePool:null,
this.sb,
this.cacheManager,
this.theLog
- });
+ });
// return the newly created object
return theCrawlWorker;
diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java
index f69845901..7b52106ee 100644
--- a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java
@@ -52,15 +52,22 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import de.anomic.server.logging.serverLog;
public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
+
+ private plasmaCrawlerFactory theFactory;
private final ThreadGroup theThreadGroup;
public boolean isClosed = false;
public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, GenericKeyedObjectPool.Config config, ThreadGroup threadGroup) {
super(objFactory, config);
+ this.theFactory = objFactory;
this.theThreadGroup = threadGroup;
objFactory.setPool(this);
}
+ public plasmaCrawlerFactory getFactory() {
+ return this.theFactory;
+ }
+
public Object borrowObject(Object key) throws Exception {
return super.borrowObject(key);
}
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index 2c7f1d701..baa413a06 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -90,7 +90,7 @@ public abstract class AbstractParser implements Parser{
* The source file file size in bytes if the source document was passed
* in as file
*/
- protected long fileSize = -1;
+ protected long contentLength = -1;
/**
* The Constructor of this class.
@@ -99,6 +99,15 @@ public abstract class AbstractParser implements Parser{
super();
this.libxDependencies = libxDependencies;
}
+
+ /**
+ * Set the content length of the source file.
+ * This value is needed by some parsers to decide
+ * if the parsed text could be hold in memory
+ */
+ public void setContentLength(long length) {
+ this.contentLength = length;
+ }
/**
* Check if the parser was interrupted.
@@ -185,7 +194,7 @@ public abstract class AbstractParser implements Parser{
BufferedInputStream contentInputStream = null;
try {
// getting the file size of the document
- this.fileSize = sourceFile.length();
+ this.contentLength = sourceFile.length();
// create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
@@ -242,4 +251,8 @@ public abstract class AbstractParser implements Parser{
public String getName() {
return this.parserName;
}
+
+ public void reset() {
+ this.contentLength = -1;
+ }
}
diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
index 83d0daa5c..a1adeae06 100644
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@@ -117,6 +117,8 @@ public interface Parser {
*/
public void reset();
+ public void setContentLength(long length);
+
/**
* @return Returns a list of library names that are needed by this parser
*/
diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java
index 60621e7f8..53b2630dd 100644
--- a/source/de/anomic/plasma/parser/bzip/bzipParser.java
+++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@@ -138,7 +138,7 @@ public class bzipParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
index b5a076399..92c116b4c 100644
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -118,6 +118,7 @@ implements Parser {
public void reset() {
// Nothing todo here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java
index 0c2af76b3..a289eb361 100644
--- a/source/de/anomic/plasma/parser/gzip/gzipParser.java
+++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@@ -122,7 +122,7 @@ public class gzipParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
index 38665c6c5..70b01f471 100644
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -187,8 +187,7 @@ implements Parser {
}
}
- public plasmaParserDocument parse(URL location, String mimeType,String charset,
- InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException, InterruptedException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
@@ -208,6 +207,7 @@ implements Parser {
public void reset() {
// Nothing todo here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
index 9d8e9e011..6fc977644 100644
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -197,8 +197,8 @@ public class odtParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
public static void main(String[] args) {
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index 5f2fca420..174d8fbd9 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -132,7 +132,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
// creating a writer for output
- if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+ if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".tmp");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
@@ -199,7 +199,8 @@ public class pdfParser extends AbstractParser implements Parser {
}
public void reset() {
- this.fileSize = -1;
+        // Nothing to do here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
index 6c52cb97c..90ee23222 100644
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -169,8 +169,8 @@ public class rpmParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
public static void main(String[] args) {
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index 41cf8573b..dbf3d11ee 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -211,8 +211,8 @@ public class rssParser extends AbstractParser implements Parser {
}
public void reset() {
- // TODO Auto-generated method stub
-
+        // Nothing to do here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index c054f079e..de5e3ff72 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -124,6 +124,7 @@ implements Parser {
public void reset() {
// Nothing todo here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index 4d3ff6860..4f066232a 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -104,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser {
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
- if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+ if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
@@ -251,7 +251,7 @@ public class tarParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index e31010537..f553d5032 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -262,8 +262,8 @@ public class vcfParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
public static void main(String[] args) {
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index 8a523dbcf..e672df7dd 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -102,7 +102,7 @@ public class zipParser extends AbstractParser implements Parser {
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
- if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+ if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
@@ -235,7 +235,7 @@ public class zipParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+        // Nothing to do here at the moment
+ super.reset();
}
}
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 6ca5bfc63..d72eb43f8 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -671,11 +671,16 @@ public final class plasmaCondenser {
}
*/
+ public static Iterator getWords(InputStream input) {
+ if (input == null) return null;
+ plasmaCondenser condenser = new plasmaCondenser(input);
+ return condenser.words();
+ }
+
public static Iterator getWords(byte[] text) {
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
- plasmaCondenser condenser = new plasmaCondenser(buffer);
- return condenser.words();
+ return getWords(buffer);
}
public static void main(String[] args) {
diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java
index e349da8bf..eac42b16e 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@@ -163,15 +163,23 @@ public final class plasmaCrawlLoader extends Thread {
return this.theThreadGroup;
}
- private void execute(plasmaCrawlLoaderMessage theMsg) throws Exception {
+ private void execute(plasmaCrawlLoaderMessage theMsg, boolean useThreadPool) throws Exception {
// getting the protocol of the next URL
String protocol = theMsg.url.getProtocol();
// TODO: remove this
if (protocol.equals("https")) protocol = "http";
- // getting a new crawler from the crawler pool
- plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
+ // get a new worker thread
+ plasmaCrawlWorker theWorker = null;
+ if (useThreadPool) {
+ // getting a new crawler from the crawler pool
+ theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
+ } else {
+ // create a new one
+ theWorker = (plasmaCrawlWorker) this.crawlwerPool.getFactory().makeObject(protocol,false);
+ }
+
if (theWorker == null) {
this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url);
} else {
@@ -187,7 +195,7 @@ public final class plasmaCrawlLoader extends Thread {
plasmaCrawlLoaderMessage theMsg = this.theQueue.waitForMessage();
// start new crawl job
- this.execute(theMsg);
+ this.execute(theMsg, true);
} catch (InterruptedException e) {
Thread.interrupted();
@@ -218,7 +226,8 @@ public final class plasmaCrawlLoader extends Thread {
String initiator,
int depth,
plasmaCrawlProfile.entry profile,
- int timeout
+ int timeout,
+ boolean keepInMemory
) throws plasmaCrawlerException {
plasmaHTCache.Entry result = null;
@@ -235,13 +244,14 @@ public final class plasmaCrawlLoader extends Thread {
profile,
crawlingPriority,
true,
- timeout
+ timeout,
+ keepInMemory
);
try {
// start new crawl job
- this.execute(theMsg);
+ this.execute(theMsg, false);
// wait for the crawl job result
result = theMsg.waitForResult();
@@ -283,7 +293,8 @@ public final class plasmaCrawlLoader extends Thread {
profile, // crawling profile
crawlingPriority, // crawling priority
false, // only download documents whose mimetypes are enabled for the crawler
- -1 // use default crawler timeout
+ -1, // use default crawler timeout
+ false // resource should not be kept in memory
);
// adding the message to the queue
diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java
index b3d678c67..60929d606 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java
@@ -56,6 +56,7 @@ public final class plasmaCrawlLoaderMessage {
public final plasmaCrawlProfile.entry profile;
public final boolean acceptAllContent;
public final int timeout;
+ public final boolean keepInMemory;
private serverSemaphore resultSync = null;
private plasmaHTCache.Entry result;
@@ -71,7 +72,8 @@ public final class plasmaCrawlLoaderMessage {
plasmaCrawlProfile.entry profile,
int crawlingPriority,
boolean acceptAllContent,
- int timeout
+ int timeout,
+ boolean keepInMemory
) {
this.url = url;
this.name = name;
@@ -82,6 +84,7 @@ public final class plasmaCrawlLoaderMessage {
this.crawlingPriority = crawlingPriority;
this.acceptAllContent = acceptAllContent;
this.timeout = timeout;
+ this.keepInMemory = keepInMemory;
this.resultSync = new serverSemaphore(0);
this.result = null;
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 293acdc28..17d34d372 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -53,9 +53,12 @@
package de.anomic.plasma;
+import java.io.BufferedInputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.Date;
@@ -701,16 +704,51 @@ public final class plasmaHTCache {
return null;
}
+ /**
+     * @param url the URL of the cached resource to load
+     * @return the cached resource content as a byte array, or null if not available
+     * 
+     * @deprecated don't use this function to avoid OutOfMemory-Exceptions.
+ * Use {@link #getResourceContentStream(URL)} instead
+ */
public byte[] loadResourceContent(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
- if (f.exists()) try {
+ if (f.exists() && f.canRead()) try {
return serverFileUtils.read(f);
} catch (IOException e) {
return null;
}
return null;
}
+
+ /**
+ * Returns the content of a cached resource as {@link InputStream}
+ * @param url the requested resource
+     * @return the resource content as {@link InputStream}. If no data
+ * is available or the cached file is not readable, null
+ * is returned.
+ */
+ public InputStream getResourceContentStream(URL url) {
+ // load the url as resource from the cache
+ File f = getCachePath(url);
+ if (f.exists() && f.canRead()) try {
+ return new BufferedInputStream(new FileInputStream(f));
+ } catch (IOException e) {
+ this.log.logSevere("Unable to create a BufferedInputStream from file " + f,e);
+ return null;
+ }
+ return null;
+ }
+
+ public long getResourceContentLength(URL url) {
+ // load the url as resource from the cache
+ File f = getCachePath(url);
+ if (f.exists() && f.canRead()) {
+ return f.length();
+ }
+ return 0;
+ }
public static boolean isPOST(String urlString) {
return (urlString.indexOf("?") >= 0 ||
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index e339bc0fe..b420b7ffc 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -45,11 +45,13 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
+import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
@@ -465,16 +467,25 @@ public final class plasmaParser {
} catch (Exception e) {/* ignore this */}
}
- public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source)
+ public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] sourceArray)
throws InterruptedException, ParserException {
- File tempFile = null;
+ ByteArrayInputStream byteIn = null;
try {
- // creating a temp file to store the byte array
- tempFile = File.createTempFile("parseSource", ".tmp");
- serverFileUtils.write(source, tempFile);
+ if (this.theLogger.isFine())
+ this.theLogger.logFine("Parsing '" + location + "' from byte-array");
+
+ // testing if the resource is not empty
+ if (sourceArray == null || sourceArray.length == 0) {
+ String errorMsg = "No resource content available.";
+ this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
+ }
+
+ // creating an InputStream
+ byteIn = new ByteArrayInputStream(sourceArray);
// parsing the temp file
- return parseSource(location, mimeType, charset, tempFile);
+ return parseSource(location, mimeType, charset, sourceArray.length, byteIn);
} catch (Exception e) {
// Interrupted- and Parser-Exceptions should pass through
@@ -482,20 +493,65 @@ public final class plasmaParser {
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
- this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e);
+ this.theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
- if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */}
+ if (byteIn != null) try { byteIn.close(); } catch (Exception ex){/* ignore this */}
}
}
- public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile)
- throws InterruptedException, ParserException {
+ public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) throws InterruptedException, ParserException {
+
+ BufferedInputStream sourceStream = null;
+ try {
+ if (this.theLogger.isFine())
+ this.theLogger.logFine("Parsing '" + location + "' from file");
+
+ // testing if the resource is not empty
+ if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
+ String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
+ this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
+ }
+
+ // create a new InputStream
+ sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
+
+ // parsing the data
+ return this.parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream);
+
+ } catch (Exception e) {
+ // Interrupted- and Parser-Exceptions should pass through
+ if (e instanceof InterruptedException) throw (InterruptedException) e;
+ if (e instanceof ParserException) throw (ParserException) e;
+ // log unexpected error
+ this.theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
+ throw new ParserException("Unexpected exception while parsing " + location,location, e);
+ } finally {
+ if (sourceStream != null) try { sourceStream.close(); } catch (Exception ex){/* ignore this */}
+ }
+ }
+
+ /**
+ * To parse a resource from an {@link InputStream}
+ * @param location the URL of the resource
+ * @param theMimeType the resource mimetype (<code>null</code>
+ *        if unknown)
+ * @param theDocumentCharset the charset of the resource (<code>null</code>
+ *        if unknown)
+ * @param contentLength the content length of the resource (<code>-1</code>
+ *        if unknown)
+ * @param sourceStream an {@link InputStream} containing the resource body
+ * @return the parsed {@link plasmaParserDocument document}
+ * @throws InterruptedException
+ * @throws ParserException
+ */
+ public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, long contentLength, InputStream sourceStream) throws InterruptedException, ParserException {
Parser theParser = null;
String mimeType = null;
try {
+ if (this.theLogger.isFine())
+ this.theLogger.logFine("Parsing '" + location + "' from stream");
+
// getting the mimetype of the document
mimeType = getRealMimeType(theMimeType);
@@ -513,66 +569,9 @@ public final class plasmaParser {
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
}
- // testing if the resource is not empty
- if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
- String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
- this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
- throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
- }
-
-
if (this.theLogger.isFine())
this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType +
- "' and file extension '" + fileExt + "'.");
-
- /*
- * There are some problematic mimeType - fileExtension combination where we have to enforce
- * a mimeType detection to get the proper parser for the content
- *
- * - application/zip + .odt
- * - text/plain + .odt
- * - text/plain + .vcf
- * - text/xml + .rss
- * - text/xml + .atom
- *
- * In all these cases we can trust the fileExtension and have to determine the proper mimeType.
- *
- */
-
-// // Handling of not trustable mimeTypes
-// // - text/plain
-// // - text/xml
-// // - application/octet-stream
-// // - application/zip
-// if (
-// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) ||
-// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt"))
-// ) {
-// if (this.theLogger.isFine())
-// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType +
-// "' that seems not to be correct for file extension '" + fileExt + "'.");
-//
-// if (enabledParserList.containsKey("application/octet-stream")) {
-// theParser = this.getParser("application/octet-stream");
-// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile);
-// if (newMime == null)
-// if (newMime instanceof String) {
-// String newMimeType = (String)newMime;
-// if ((newMimeType.equals("application/octet-stream")) {
-// return null;
-// }
-// mimeType = newMimeType;
-// }
-// } else {
-// return null;
-// }
-// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){
-// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) {
-// mimeType = "application/vnd.oasis.opendocument.text";
-// } else {
-// return null;
-// }
-// }
+ "' and file extension '" + fileExt + "'.");
// getting the correct parser for the given mimeType
theParser = this.getParser(mimeType);
@@ -580,9 +579,12 @@ public final class plasmaParser {
// if a parser was found we use it ...
plasmaParserDocument doc = null;
if (theParser != null) {
- doc = theParser.parse(location, mimeType,documentCharset,sourceFile);
+ // set the content length of the resource
+ theParser.setContentLength(contentLength);
+ // parse the resource
+ doc = theParser.parse(location, mimeType,documentCharset,sourceStream);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
- doc = parseHtml(location, mimeType, documentCharset, sourceFile);
+ doc = parseHtml(location, mimeType, documentCharset, sourceStream);
} else {
String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
@@ -611,14 +613,13 @@ public final class plasmaParser {
if (theParser != null) {
try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */}
}
- }
+ }
}
- private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException {
+ private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, InputStream sourceStream) throws IOException, ParserException {
// ...otherwise we make a scraper and transformer
- FileInputStream fileIn = new FileInputStream(sourceFile);
- htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false);
+ htmlFilterInputStream htmlFilter = new htmlFilterInputStream(sourceStream,documentCharset,location,null,false);
String charset = htmlFilter.detectCharset();
if (charset == null) {
charset = documentCharset;
@@ -763,7 +764,7 @@ public final class plasmaParser {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
- File contentFile = null;
+ Object content = null;
URL contentURL = null;
String contentMimeType = "application/octet-stream";
String charSet = "UTF-8";
@@ -774,17 +775,13 @@ public final class plasmaParser {
String mode = args[0];
if (mode.equalsIgnoreCase("-f")) {
- contentFile = new File(args[1]);
- contentURL = new URL(contentFile);
+ content = new File(args[1]);
+ contentURL = new URL((File)content);
} else if (mode.equalsIgnoreCase("-u")) {
contentURL = new URL(args[1]);
// downloading the document content
- byte[] contentBytes = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null);
-
- contentFile = File.createTempFile("content",".tmp");
- contentFile.deleteOnExit();
- serverFileUtils.write(contentBytes, contentFile);
+ content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null);
}
if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
@@ -805,7 +802,12 @@ public final class plasmaParser {
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content
- plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
+ plasmaParserDocument document = null;
+ if (content instanceof byte[]) {
+ document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content);
+ } else if (content instanceof File) {
+ document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content);
+ }
// printing out all parsed sentences
if (document != null) {
diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java
index 129302433..d6ea1bd9d 100644
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@@ -41,6 +41,7 @@
package de.anomic.plasma;
+import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Map;
@@ -59,13 +60,18 @@ public final class plasmaSearchImages {
long start = System.currentTimeMillis();
this.images = new TreeSet();
if (maxTime > 10) {
- byte[] res = sc.getResource(url, true, (int) maxTime);
+ Object[] resource = sc.getResource(url, true, (int) maxTime);
+ InputStream res = (InputStream) resource[0];
+ Long resLength = (Long) resource[1];
if (res != null) {
plasmaParserDocument document = null;
try {
- document = sc.parseDocument(url, res);
+ // parse the document
+ document = sc.parseDocument(url, resLength.longValue(), res);
} catch (ParserException e) {
// parsing failed
+ } finally {
+ try { res.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return;
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index d2a1f6864..80a63a1a1 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -44,7 +44,9 @@
package de.anomic.plasma;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -187,46 +189,62 @@ public class plasmaSnippetCache {
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
- byte[] resource = null;
- IResourceInfo docInfo = null;
+ long resContentLength = 0;
+ InputStream resContent = null;
+ IResourceInfo resInfo = null;
try {
// trying to load the resource from the cache
- resource = this.cacheManager.loadResourceContent(url);
+ resContent = this.cacheManager.getResourceContentStream(url);
+ if (resContent != null) {
+ // if the content was found
+ resContentLength = this.cacheManager.getResourceContentLength(url);
+
+ // getting resource metadata
+ resInfo = this.cacheManager.loadResourceInfo(url);
- // if not found try to download it
- if ((resource == null) && (fetchOnline)) {
- // download resource using the crawler
- plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout);
+ } else if (fetchOnline) {
+ // if not found try to download it
- // getting resource metadata (e.g. the http headers for http resources)
- if (entry != null) docInfo = entry.getDocumentInfo();
+ // download resource using the crawler and keep resource in memory if possible
+ plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true);
- // read resource body (if it is there)
- resource = entry.cacheArray();
+ // getting resource metadata (e.g. the http headers for http resources)
+ if (entry != null) {
+ resInfo = entry.getDocumentInfo();
+
+ // read resource body (if it is there)
+ byte []resourceArray = entry.cacheArray();
+ if (resourceArray != null) {
+ resContent = new ByteArrayInputStream(resourceArray);
+ resContentLength = resourceArray.length;
+ } else {
+ resContent = this.cacheManager.getResourceContentStream(url);
+ resContentLength = this.cacheManager.getResourceContentLength(url);
+ }
+ }
- // in case that the reosurce was not in ram, read it from disk
- if (resource == null) resource = this.cacheManager.loadResourceContent(url);
+ // if it is still not available, report an error
+ if (resContent == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
- // if it is still not available, throw exception
- if (resource == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
-
source = SOURCE_WEB;
+ } else {
+ return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
}
} catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
- }
+ }
- if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
-
/* ===========================================================================
* PARSING RESOURCE
* =========================================================================== */
plasmaParserDocument document = null;
try {
- document = parseDocument(url, resource, docInfo);
+ document = parseDocument(url, resContentLength, resContent, resInfo);
} catch (ParserException e) {
return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+ } finally {
+ try { resContent.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
@@ -263,30 +281,40 @@ public class plasmaSnippetCache {
* @return the parsed document as {@link plasmaParserDocument}
*/
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
- byte[] resource = null;
IResourceInfo docInfo = null;
try {
// trying to load the resource body from cache
- resource = this.cacheManager.loadResourceContent(url);
+ InputStream content = this.cacheManager.getResourceContentStream(url);
+ long resourceLength = this.cacheManager.getResourceContentLength(url);
// if not available try to load resource from web
- if ((fetchOnline) && (resource == null)) {
+ if ((fetchOnline) && (content == null)) {
// download resource using crawler
- plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
+ plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000, true);
// fetching metadata of the resource (e.g. http headers for http resource)
- if (entry != null) docInfo = entry.getDocumentInfo();
-
- // getting the resource body from the cache
- resource = this.cacheManager.loadResourceContent(url);
+ if (entry != null) {
+ docInfo = entry.getDocumentInfo();
+
+ byte[] resourceArray = entry.cacheArray();
+ if (resourceArray != null) {
+ // read resource body (if it is there)
+ content = new ByteArrayInputStream(resourceArray);
+ resourceLength = resourceArray.length;
+ } else {
+ // in case that the resource was not in ram, read it from disk
+ content = this.cacheManager.getResourceContentStream(url);
+ resourceLength = this.cacheManager.getResourceContentLength(url);
+ }
+ }
} else {
// trying to load resource metadata
docInfo = this.cacheManager.loadResourceInfo(url);
}
// parsing document
- if (resource == null) return null;
- return parseDocument(url, resource, docInfo);
+ if (content == null) return null;
+ return parseDocument(url, resourceLength, content, docInfo);
} catch (ParserException e) {
this.log.logWarning("Unable to parse resource. " + e.getMessage());
return null;
@@ -446,15 +474,24 @@ public class plasmaSnippetCache {
return map;
}
- public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException {
- return parseDocument(url, resource, null);
+ public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException {
+ return parseDocument(url, contentLength, resourceStream, null);
}
- public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException {
+ /**
+ * Parse the resource
+ * @param url the URL of the resource
+ * @param contentLength the contentLength of the resource
+ * @param resourceStream the resource body as stream
+ * @param docInfo metadata about the resource
+ * @return the extracted data
+ * @throws ParserException
+ */
+ public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream, IResourceInfo docInfo) throws ParserException {
try {
- if (resource == null) return null;
+ if (resourceStream == null) return null;
- // if no resource metadata is available, try to load it
+ // STEP 1: if no resource metadata is available, try to load it from cache
if (docInfo == null) {
// try to get the header from the htcache directory
try {
@@ -464,18 +501,21 @@ public class plasmaSnippetCache {
}
}
+ // STEP 2: if the metadata is still null try to download it from web
+ if ((docInfo == null) && (url.getProtocol().startsWith("http"))) {
// TODO: we need a better solution here
- // encapsulate this in the crawlLoader class
- if ((docInfo == null) && (url.getProtocol().startsWith("http"))) {
- // getting URL mimeType
- try {
- httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig);
- docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header);
- } catch (Exception e) {
- // ingore this. http header download failed
- }
- }
+ // e.g. encapsulate this in the crawlLoader class
+
+ // getting URL mimeType
+ try {
+ httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig);
+ docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header);
+ } catch (Exception e) {
+ // ignore this. http header download failed
+ }
+ }
+ // STEP 3: if the metadata is still null try to guess the mimeType of the resource
if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
@@ -495,12 +535,12 @@ public class plasmaSnippetCache {
supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
}
- return this.parser.parseSource(url, supposedMime, null, resource);
+ return this.parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
return null;
- }
+ }
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
- return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), resource);
+ return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), contentLength, resourceStream);
}
return null;
} catch (InterruptedException e) {
@@ -509,27 +549,57 @@ public class plasmaSnippetCache {
}
}
- public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
+ /**
+ *
+ * @param url
+ * @param fetchOnline
+ * @param socketTimeout
+ * @return an Object array containing
+ *
+ *         [0] the content as {@link InputStream}
+ *         [1] the content-length as {@link Integer}