diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index f990a530a..79c29238d 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -216,7 +216,7 @@ public class ViewFile {
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
- String content = new String(document.getText());
+ String content = new String(document.getTextBytes());
content = wikiCode.replaceHTML(content); //added by Marc Nause
            content = content.replaceAll("\n","&lt;br&gt;")
.replaceAll("\t"," ");
diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java
index ccebe894d..8168be210 100644
--- a/source/de/anomic/http/httpd.java
+++ b/source/de/anomic/http/httpd.java
@@ -406,24 +406,24 @@ public final class httpd implements serverHandler {
/*
* Handling SOAP Requests here ...
*/
- if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap")) {
- if (soapHandler == null) {
+ if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap/")) {
+ if (this.soapHandler == null) {
try {
Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler");
Constructor classConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } );
- soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard });
+ this.soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard });
} catch (Exception e) {
- sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
+ sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (NoClassDefFoundError e) {
- sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",null);
+ sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (Error e) {
- sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
+ sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
}
}
- soapHandler.doGet(this.prop, header, this.session.out);
+ this.soapHandler.doGet(this.prop, header, this.session.out);
/*
* Handling HTTP requests here ...
@@ -541,7 +541,7 @@ public final class httpd implements serverHandler {
// we now know the HTTP version. depending on that, we read the header
httpHeader header;
- String httpVersion = prop.getProperty("HTTP", "HTTP/0.9");
+ String httpVersion = this.prop.getProperty("HTTP", "HTTP/0.9");
if (httpVersion.equals("HTTP/0.9")) header = new httpHeader(reverseMappingCache);
else header = httpHeader.readHeader(this.prop,this.session);
@@ -559,8 +559,8 @@ public final class httpd implements serverHandler {
/*
* Handling SOAP Requests here ...
*/
- if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap")) {
- if (soapHandler == null) {
+ if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap/")) {
+ if (this.soapHandler == null) {
try {
// creating the soap handler class by name
Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler");
@@ -569,19 +569,19 @@ public final class httpd implements serverHandler {
Constructor soapHandlerConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } );
// creating the new object
- soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } );
+ this.soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } );
} catch (Exception e) {
- sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
+ sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (NoClassDefFoundError e) {
- sendRespondError(this.prop,this.session.out,4,503,"SOAP Extension not installed","SOAP Extension not installed",null);
+ sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (Error e) {
- sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
+ sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
}
}
- soapHandler.doPost(prop, header, this.session.out, this.session.in);
+ this.soapHandler.doPost(this.prop, header, this.session.out, this.session.in);
/*
* Handling normal HTTP requests here ...
*/
@@ -981,7 +981,7 @@ public final class httpd implements serverHandler {
int httpStatusCode,
String httpStatusText,
String detailedErrorMsg,
- Exception stackTrace
+ Throwable stackTrace
) throws IOException {
sendRespondError(
conProp,
@@ -1004,7 +1004,7 @@ public final class httpd implements serverHandler {
String httpStatusText,
File detailedErrorMsgFile,
serverObjects detailedErrorMsgValues,
- Exception stackTrace
+ Throwable stackTrace
) throws IOException {
sendRespondError(
conProp,
@@ -1029,7 +1029,7 @@ public final class httpd implements serverHandler {
String detailedErrorMsgText,
Object detailedErrorMsgFile,
serverObjects detailedErrorMsgValues,
- Exception stackTrace,
+ Throwable stackTrace,
httpHeader header
) throws IOException {
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index fdaef3ba7..2c7f1d701 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -64,7 +64,7 @@ import de.anomic.server.logging.serverLog;
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public abstract class AbstractParser implements Parser{
-
+
/**
* a list of library names that are needed by this parser
*/
@@ -86,6 +86,12 @@ public abstract class AbstractParser implements Parser{
*/
protected String parserName = this.getClass().getName();
+ /**
+ * The source file file size in bytes if the source document was passed
+ * in as file
+ */
+ protected long fileSize = -1;
+
/**
* The Constructor of this class.
*/
@@ -178,6 +184,9 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
+ // getting the file size of the document
+ this.fileSize = sourceFile.length();
+
// create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
index 505bac7fb..83d0daa5c 100644
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@@ -60,6 +60,9 @@ import de.anomic.server.logging.serverLog;
*/
public interface Parser {
+
+ public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;
+
/**
* Parsing a document available as byte array
* @param location the origin of the document
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index 0de357ff5..5f2fca420 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -43,8 +43,11 @@
package de.anomic.plasma.parser.pdf;
+import java.io.File;
+import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
+import java.io.Writer;
import java.util.Hashtable;
import org.pdfbox.pdfparser.PDFParser;
@@ -53,11 +56,12 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.net.URL;
+import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
-import de.anomic.server.serverByteBuffer;
+import de.anomic.server.serverCharBuffer;
public class pdfParser extends AbstractParser implements Parser {
@@ -87,9 +91,9 @@ public class pdfParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
-
PDDocument theDocument = null;
- OutputStreamWriter writer = null;
+ Writer writer = null;
+ File writerFile = null;
try {
// reducing thread priority
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
@@ -114,6 +118,10 @@ public class pdfParser extends AbstractParser implements Parser {
PDFTextStripper stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
+ if (theDocument.isEncrypted()) {
+ throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED);
+ }
+
// extracting some metadata
PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
if (theDocInfo != null) {
@@ -121,42 +129,54 @@ public class pdfParser extends AbstractParser implements Parser {
docSubject = theDocInfo.getSubject();
//docAuthor = theDocInfo.getAuthor();
docKeywordStr = theDocInfo.getKeywords();
- }
-
- serverByteBuffer out = new serverByteBuffer();
- writer = new OutputStreamWriter( out );
- stripper.writeText(theDocument, writer );
-
- writer.close(); writer = null;
- theDocument.close(); theDocument = null;
+ }
- byte[] contents = out.toByteArray();
- out.close();
- out = null;
-
- if ((docTitle == null) || (docTitle.length() == 0)) {
- docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")).
- replaceAll("\r\n"," ").
- replaceAll("\n"," ").
- replaceAll("\r"," ").
- replaceAll("\t"," ");
+ // creating a writer for output
+ if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+ writerFile = File.createTempFile("pdfParser",".tmp");
+ writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
+ } else {
+ writer = new serverCharBuffer();
}
+
+ stripper.writeText(theDocument, writer );
+ theDocument.close(); theDocument = null;
+ writer.close();
+
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
- plasmaParserDocument theDoc = new plasmaParserDocument(
- location,
- mimeType,
- "UTF-8",
- docKeywords,
- docSubject,
- docTitle,
- null,
- null,
- contents,
- null,
- null);
+ plasmaParserDocument theDoc = null;
+
+ if (writer instanceof serverCharBuffer) {
+ byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
+ theDoc = new plasmaParserDocument(
+ location,
+ mimeType,
+ "UTF-8",
+ docKeywords,
+ docSubject,
+ docTitle,
+ null,
+ null,
+ contentBytes,
+ null,
+ null);
+ } else {
+ theDoc = new plasmaParserDocument(
+ location,
+ mimeType,
+ "UTF-8",
+ docKeywords,
+ docSubject,
+ docTitle,
+ null,
+ null,
+ writerFile,
+ null,
+ null);
+ }
return theDoc;
}
@@ -164,6 +184,12 @@ public class pdfParser extends AbstractParser implements Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
+ // close the writer
+ if (writer != null) try { writer.close(); } catch (Exception ex) {/* ignore this */}
+
+ // delete the file
+ if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */}
+
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
@@ -173,8 +199,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
public void reset() {
- // Nothing todo here at the moment
-
+ this.fileSize = -1;
}
}
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index 98bcfda5c..c054f079e 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -80,8 +80,7 @@ implements Parser {
this.parserName = "Rich Text Format Parser";
}
- public plasmaParserDocument parse(URL location, String mimeType, String charset,
- InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
@@ -105,7 +104,7 @@ implements Parser {
null,
null,
null,
- bodyText.getBytes(),
+ bodyText.getBytes("UTF-8"),
null,
null);
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index 023f38b40..4d3ff6860 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -43,8 +43,11 @@
package de.anomic.plasma.parser.tar;
+import java.io.BufferedOutputStream;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.InputStream;
+import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@@ -96,7 +99,18 @@ public class tarParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
+ long docTextLength = 0;
+ OutputStream docText = null;
+ File outputFile = null;
+ plasmaParserDocument subDoc = null;
try {
+ if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+ outputFile = File.createTempFile("zipParser",".tmp");
+ docText = new BufferedOutputStream(new FileOutputStream(outputFile));
+ } else {
+ docText = new serverByteBuffer();
+ }
+
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@@ -116,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docLongTitle = new StringBuffer();
LinkedList docSections = new LinkedList();
StringBuffer docAbstrct = new StringBuffer();
- serverByteBuffer docText = new serverByteBuffer();
+
Map docAnchors = new HashMap();
TreeSet docImages = new TreeSet();
@@ -141,55 +155,58 @@ public class tarParser extends AbstractParser implements Parser {
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// getting the entry content
- plasmaParserDocument theDoc = null;
- File tempFile = null;
+ File subDocTempFile = null;
try {
// create the temp file
- tempFile = createTempFile(entryName);
+ subDocTempFile = createTempFile(entryName);
// copy the data into the file
- serverFileUtils.copy(tin,tempFile,entry.getSize());
+ serverFileUtils.copy(tin,subDocTempFile,entry.getSize());
// check for interruption
checkInterruption();
// parsing the content
- theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile);
+ subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,subDocTempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
} finally {
- if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
+ if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */}
}
- if (theDoc == null) continue;
+ if (subDoc == null) continue;
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
- docKeywords.append(theDoc.getKeywords(','));
+ docKeywords.append(subDoc.getKeywords(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
- docLongTitle.append(theDoc.getMainLongTitle());
+ docLongTitle.append(subDoc.getMainLongTitle());
if (docShortTitle.length() > 0) docShortTitle.append("\n");
- docShortTitle.append(theDoc.getMainShortTitle());
+ docShortTitle.append(subDoc.getMainShortTitle());
- docSections.addAll(Arrays.asList(theDoc.getSectionTitles()));
+ docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
- docAbstrct.append(theDoc.getAbstract());
+ docAbstrct.append(subDoc.getAbstract());
- if (docText.length() > 0) docText.append("\n");
- docText.append(theDoc.getText());
+ if (subDoc.getTextLength() > 0) {
+ if (docTextLength > 0) docText.write('\n');
+ docTextLength += serverFileUtils.copy(subDoc.getText(), docText);
+ }
+
+ docAnchors.putAll(subDoc.getAnchors());
+ docImages.addAll(subDoc.getImages());
- docAnchors.putAll(theDoc.getAnchors());
- docImages.addAll(theDoc.getImages());
+ // release subdocument
+ subDoc.close();
+ subDoc = null;
}
- /* (URL location, String mimeType,
- String keywords, String shortTitle, String longTitle,
- String[] sections, String abstrct,
- byte[] text, Map anchors, Map images)
- */
- return new plasmaParserDocument(
+ plasmaParserDocument result = null;
+
+ if (docText instanceof serverByteBuffer) {
+ result = new plasmaParserDocument(
location,
mimeType,
null,
@@ -198,13 +215,37 @@ public class tarParser extends AbstractParser implements Parser {
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
- docText.toByteArray(),
+ ((serverByteBuffer)docText).toByteArray(),
docAnchors,
docImages);
+ } else {
+ result = new plasmaParserDocument(
+ location,
+ mimeType,
+ null,
+ docKeywords.toString().split(" |,"),
+ docShortTitle.toString(),
+ docLongTitle.toString(),
+ (String[])docSections.toArray(new String[docSections.size()]),
+ docAbstrct.toString(),
+ outputFile,
+ docAnchors,
+ docImages);
+ }
+
+ return result;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
+ if (subDoc != null) subDoc.close();
+
+ // close the writer
+ if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */}
+
+ // delete the file
+ if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */}
+
throw new ParserException("Unexpected error while parsing tar resource. " + e.getMessage(),location);
}
}
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index aca8f6505..8a523dbcf 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -43,8 +43,11 @@
package de.anomic.plasma.parser.zip;
+import java.io.BufferedOutputStream;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.InputStream;
+import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@@ -94,13 +97,23 @@ public class zipParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
+ long docTextLength = 0;
+ OutputStream docText = null;
+ File outputFile = null;
+ plasmaParserDocument subDoc = null;
try {
+ if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+ outputFile = File.createTempFile("zipParser",".tmp");
+ docText = new BufferedOutputStream(new FileOutputStream(outputFile));
+ } else {
+ docText = new serverByteBuffer();
+ }
+
StringBuffer docKeywords = new StringBuffer();
StringBuffer docShortTitle = new StringBuffer();
StringBuffer docLongTitle = new StringBuffer();
LinkedList docSections = new LinkedList();
StringBuffer docAbstrct = new StringBuffer();
- serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
TreeSet docImages = new TreeSet();
@@ -128,48 +141,56 @@ public class zipParser extends AbstractParser implements Parser {
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// parsing the content
- plasmaParserDocument theDoc = null;
- File tempFile = null;
+ File subDocTempFile = null;
try {
// create the temp file
- tempFile = createTempFile(entryName);
+ subDocTempFile = createTempFile(entryName);
// copy the data into the file
- serverFileUtils.copy(zippedContent,tempFile,entry.getSize());
+ serverFileUtils.copy(zippedContent,subDocTempFile,entry.getSize());
// parsing the zip file entry
- theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile);
+ subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, subDocTempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
} finally {
- if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
+ if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */}
}
- if (theDoc == null) continue;
+ if (subDoc == null) continue;
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
- docKeywords.append(theDoc.getKeywords(','));
+ docKeywords.append(subDoc.getKeywords(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
- docLongTitle.append(theDoc.getMainLongTitle());
+ docLongTitle.append(subDoc.getMainLongTitle());
if (docShortTitle.length() > 0) docShortTitle.append("\n");
- docShortTitle.append(theDoc.getMainShortTitle());
+ docShortTitle.append(subDoc.getMainShortTitle());
- docSections.addAll(Arrays.asList(theDoc.getSectionTitles()));
+ docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
- docAbstrct.append(theDoc.getAbstract());
+ docAbstrct.append(subDoc.getAbstract());
- if (docText.length() > 0) docText.append("\n");
- docText.append(theDoc.getText());
+ if (subDoc.getTextLength() > 0) {
+ if (docTextLength > 0) docText.write('\n');
+ docTextLength += serverFileUtils.copy(subDoc.getText(), docText);
+ }
- docAnchors.putAll(theDoc.getAnchors());
- docImages.addAll(theDoc.getImages());
+ docAnchors.putAll(subDoc.getAnchors());
+ docImages.addAll(subDoc.getImages());
+
+ // release subdocument
+ subDoc.close();
+ subDoc = null;
}
- return new plasmaParserDocument(
+ plasmaParserDocument result = null;
+
+ if (docText instanceof serverByteBuffer) {
+ result = new plasmaParserDocument(
location,
mimeType,
null,
@@ -178,13 +199,37 @@ public class zipParser extends AbstractParser implements Parser {
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
- docText.toByteArray(),
+ ((serverByteBuffer)docText).toByteArray(),
docAnchors,
docImages);
+ } else {
+ result = new plasmaParserDocument(
+ location,
+ mimeType,
+ null,
+ docKeywords.toString().split(" |,"),
+ docShortTitle.toString(),
+ docLongTitle.toString(),
+ (String[])docSections.toArray(new String[docSections.size()]),
+ docAbstrct.toString(),
+ outputFile,
+ docAnchors,
+ docImages);
+ }
+
+ return result;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
+ if (subDoc != null) subDoc.close();
+
+ // close the writer
+ if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */}
+
+ // delete the file
+ if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */}
+
throw new ParserException("Unexpected error while parsing zip resource. " + e.getClass().getName() + ": "+ e.getMessage(),location);
}
}
diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java
index ba6d7d43e..c05f08e5b 100644
--- a/source/de/anomic/plasma/plasmaCrawlEURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlEURL.java
@@ -117,6 +117,7 @@ public class plasmaCrawlEURL extends indexURL {
// Parser errors
public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";
+ public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)";
public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)";
// indexing errors
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 536307b8a..b0038249f 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -515,7 +515,7 @@ public final class plasmaParser {
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
- String errorMsg = "No resource content available.";
+ String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index c8de2b6af..0ca9bb8a1 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -42,8 +42,14 @@
package de.anomic.plasma;
+import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
import java.net.MalformedURLException;
+import de.anomic.server.serverFileUtils;
+
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -62,7 +68,7 @@ public class plasmaParserDocument {
String longTitle; // the real title of the document, commonly h1-tags
String[] sections; // if present: more titles/headlines appearing in the document
String abstrct; // an abstract, if present: short content description
- byte[] text; // the clear text, all that is visible
+ private Object text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
TreeSet images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
@@ -95,6 +101,29 @@ public class plasmaParserDocument {
this.condenser = null;
this.resorted = false;
}
+
+ public plasmaParserDocument(URL location, String mimeType, String charset,
+ String[] keywords, String shortTitle, String longTitle,
+ String[] sections, String abstrct,
+ File text, Map anchors, TreeSet images) {
+ this.location = location;
+ this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
+ this.charset = charset;
+ this.keywords = (keywords==null) ? new String[0] : keywords;
+ this.shortTitle = (shortTitle==null)?"":shortTitle;
+ this.longTitle = (longTitle==null)?"":longTitle;
+ this.sections = (sections==null)?new String[0]:sections;
+ this.abstrct = (abstrct==null)?"":abstrct;
+ this.text = text;
+ if (text != null) text.deleteOnExit();
+ this.anchors = (anchors==null)?new HashMap(0):anchors;
+ this.images = (images==null)?new TreeSet():images;
+ this.hyperlinks = null;
+ this.medialinks = null;
+ this.emaillinks = null;
+ this.condenser = null;
+ this.resorted = false;
+ }
public String getMimeType() {
return this.mimeType;
@@ -103,7 +132,7 @@ public class plasmaParserDocument {
/**
* @return the supposed charset of this document or null
if unknown
*/
- public String getCharset() {
+ public String getSourceCharset() {
return this.charset;
}
@@ -123,13 +152,41 @@ public class plasmaParserDocument {
if (abstrct != null) return abstrct; else return getMainLongTitle();
}
- public byte[] getText() {
- // returns only the clear (visible) text (not the source data)
- return text;
+ public InputStream getText() {
+ try {
+ if (this.text == null) return null;
+
+ if (this.text instanceof File) return new BufferedInputStream(new FileInputStream((File)this.text));
+ else if (this.text instanceof byte[]) return new ByteArrayInputStream((byte[])this.text);
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ public byte[] getTextBytes() {
+ try {
+ if (this.text == null) return new byte[0];
+
+ if (this.text instanceof File) return serverFileUtils.read((File)this.text);
+ else if (this.text instanceof byte[]) return (byte[])this.text;
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ return new byte[0];
+ }
+
+ public long getTextLength() {
+ if (this.text == null) return 0;
+ if (this.text instanceof File) return ((File)this.text).length();
+ else if (this.text instanceof byte[]) return ((byte[])this.text).length;
+
+ return -1;
}
public plasmaCondenser getCondenser() {
- if (condenser == null) condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0);
+ if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
return condenser;
}
@@ -262,4 +319,16 @@ public class plasmaParserDocument {
this.resorted = true;
}
+ public void close() {
+ // delete the temp file
+ if ((this.text != null) && (this.text instanceof File)) {
+ try { ((File)this.text).delete(); } catch (Exception e) {/* ignore this */}
+ }
+ }
+
+ protected void finalize() throws Throwable {
+ this.close();
+ super.finalize();
+ }
+
}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 4e0fba5ba..4d37c31b7 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1416,6 +1416,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
+ plasmaParserDocument document = null;
try {
// work off one stack entry with a fresh resource
long stackStartTime = 0, stackEndTime = 0,
@@ -1456,7 +1457,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* =========================================================================
* PARSE CONTENT
* ========================================================================= */
- plasmaParserDocument document = null;
parsingStartTime = System.currentTimeMillis();
try {
@@ -1527,7 +1527,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
- plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
+ plasmaCondenser condenser = new plasmaCondenser(document.getText());
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser);
@@ -1700,8 +1700,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription +
- "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
- "Size: " + document.text.length + " bytes | " +
+ "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " +
+ "Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
@@ -1744,6 +1744,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
}
+ document.close();
document = null;
} catch (Exception e) {
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
@@ -1772,6 +1773,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cacheManager.deleteFile(entry.url());
}
entry = null;
+
+ if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
}
}
@@ -1807,7 +1810,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
- kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getText().length, 3) + // length of plain text in bytes
+ kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_SIMI_WORDS, 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
@@ -2173,7 +2176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try {
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
- Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
+ Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself