From cd5f349666ba2d9cc3b3d9cd5488b72333dabb76 Mon Sep 17 00:00:00 2001 From: theli Date: Sat, 30 Sep 2006 09:31:53 +0000 Subject: [PATCH] *) Better handling of large files during parsing Extracted text of files that are larger than 5MB is stored in a temp file instead of keeping it in memory *) plasmaParserDocument.java: getText now returns an inputStream instead of a byte array *) plasmaParserDocument.java: new function getTextBytes returns the parsed content as byte array Attention: the caller of this function has to ensure that enough memory is available to do this to avoid OutOfMemory Exceptions *) httpd.java: better error handling if the soaphandler is not installed *) pdfParser.java: - better handling of documents with exotic charsets - better handling of large documents - better error logging of encrypted documents *) rtfParser.java: Bugfix for UTF-8 support *) tarParser.java: better handling of large documents *) zipParser.java: better handling of large documents *) plasmaCrawlEURL.java: new errorcode for encrypted documents *) plasmaParserDocument.java: the extracted text can now be passed to this object as byte array or temp file git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2679 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 2 +- source/de/anomic/http/httpd.java | 36 +++---- .../anomic/plasma/parser/AbstractParser.java | 11 ++- source/de/anomic/plasma/parser/Parser.java | 3 + .../anomic/plasma/parser/pdf/pdfParser.java | 95 ++++++++++++------- .../anomic/plasma/parser/rtf/rtfParser.java | 5 +- .../anomic/plasma/parser/tar/tarParser.java | 89 ++++++++++++----- .../anomic/plasma/parser/zip/zipParser.java | 83 ++++++++++++---- source/de/anomic/plasma/plasmaCrawlEURL.java | 1 + source/de/anomic/plasma/plasmaParser.java | 2 +- .../anomic/plasma/plasmaParserDocument.java | 81 ++++++++++++++-- .../de/anomic/plasma/plasmaSwitchboard.java | 15 +-- 12 files changed, 309 insertions(+), 114 deletions(-) diff --git 
a/htroot/ViewFile.java b/htroot/ViewFile.java index f990a530a..79c29238d 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -216,7 +216,7 @@ public class ViewFile { resMime = document.getMimeType(); if (viewMode.equals("parsed")) { - String content = new String(document.getText()); + String content = new String(document.getTextBytes()); content = wikiCode.replaceHTML(content); //added by Marc Nause content = content.replaceAll("\n","
") .replaceAll("\t","    "); diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index ccebe894d..8168be210 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -406,24 +406,24 @@ public final class httpd implements serverHandler { /* * Handling SOAP Requests here ... */ - if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap")) { - if (soapHandler == null) { + if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap/")) { + if (this.soapHandler == null) { try { Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler"); Constructor classConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } ); - soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard }); + this.soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard }); } catch (Exception e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (NoClassDefFoundError e) { - sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (Error e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } } - soapHandler.doGet(this.prop, header, this.session.out); + this.soapHandler.doGet(this.prop, header, this.session.out); /* * Handling HTTP requests here ... 
@@ -541,7 +541,7 @@ public final class httpd implements serverHandler { // we now know the HTTP version. depending on that, we read the header httpHeader header; - String httpVersion = prop.getProperty("HTTP", "HTTP/0.9"); + String httpVersion = this.prop.getProperty("HTTP", "HTTP/0.9"); if (httpVersion.equals("HTTP/0.9")) header = new httpHeader(reverseMappingCache); else header = httpHeader.readHeader(this.prop,this.session); @@ -559,8 +559,8 @@ public final class httpd implements serverHandler { /* * Handling SOAP Requests here ... */ - if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap")) { - if (soapHandler == null) { + if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap/")) { + if (this.soapHandler == null) { try { // creating the soap handler class by name Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler"); @@ -569,19 +569,19 @@ public final class httpd implements serverHandler { Constructor soapHandlerConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } ); // creating the new object - soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } ); + this.soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } ); } catch (Exception e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (NoClassDefFoundError e) { - sendRespondError(this.prop,this.session.out,4,503,"SOAP Extension not installed","SOAP Extension not installed",null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (Error e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP 
Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } } - soapHandler.doPost(prop, header, this.session.out, this.session.in); + this.soapHandler.doPost(this.prop, header, this.session.out, this.session.in); /* * Handling normal HTTP requests here ... */ @@ -981,7 +981,7 @@ public final class httpd implements serverHandler { int httpStatusCode, String httpStatusText, String detailedErrorMsg, - Exception stackTrace + Throwable stackTrace ) throws IOException { sendRespondError( conProp, @@ -1004,7 +1004,7 @@ public final class httpd implements serverHandler { String httpStatusText, File detailedErrorMsgFile, serverObjects detailedErrorMsgValues, - Exception stackTrace + Throwable stackTrace ) throws IOException { sendRespondError( conProp, @@ -1029,7 +1029,7 @@ public final class httpd implements serverHandler { String detailedErrorMsgText, Object detailedErrorMsgFile, serverObjects detailedErrorMsgValues, - Exception stackTrace, + Throwable stackTrace, httpHeader header ) throws IOException { diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index fdaef3ba7..2c7f1d701 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -64,7 +64,7 @@ import de.anomic.server.logging.serverLog; * @version $LastChangedRevision$ / $LastChangedDate$ */ public abstract class AbstractParser implements Parser{ - + /** * a list of library names that are needed by this parser */ @@ -86,6 +86,12 @@ public abstract class AbstractParser implements Parser{ */ protected String parserName = this.getClass().getName(); + /** + * The source file file size in bytes if the source document was passed + * in as file + */ + protected long fileSize = -1; + /** * The Constructor of this class. 
*/ @@ -178,6 +184,9 @@ public abstract class AbstractParser implements Parser{ ) throws ParserException, InterruptedException { BufferedInputStream contentInputStream = null; try { + // getting the file size of the document + this.fileSize = sourceFile.length(); + // create a stream from the file contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 505bac7fb..83d0daa5c 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -60,6 +60,9 @@ import de.anomic.server.logging.serverLog; */ public interface Parser { + + public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024; + /** * Parsing a document available as byte array * @param location the origin of the document diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 0de357ff5..5f2fca420 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -43,8 +43,11 @@ package de.anomic.plasma.parser.pdf; +import java.io.File; +import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.Hashtable; import org.pdfbox.pdfparser.PDFParser; @@ -53,11 +56,12 @@ import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.util.PDFTextStripper; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; -import de.anomic.server.serverByteBuffer; +import de.anomic.server.serverCharBuffer; public class pdfParser extends AbstractParser implements Parser { @@ -87,9 +91,9 @@ public class pdfParser extends AbstractParser implements Parser { public 
plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { - PDDocument theDocument = null; - OutputStreamWriter writer = null; + Writer writer = null; + File writerFile = null; try { // reducing thread priority Thread.currentThread().setPriority(Thread.MIN_PRIORITY); @@ -114,6 +118,10 @@ public class pdfParser extends AbstractParser implements Parser { PDFTextStripper stripper = new PDFTextStripper(); theDocument = parser.getPDDocument(); + if (theDocument.isEncrypted()) { + throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED); + } + // extracting some metadata PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); if (theDocInfo != null) { @@ -121,42 +129,54 @@ public class pdfParser extends AbstractParser implements Parser { docSubject = theDocInfo.getSubject(); //docAuthor = theDocInfo.getAuthor(); docKeywordStr = theDocInfo.getKeywords(); - } - - serverByteBuffer out = new serverByteBuffer(); - writer = new OutputStreamWriter( out ); - stripper.writeText(theDocument, writer ); - - writer.close(); writer = null; - theDocument.close(); theDocument = null; + } - byte[] contents = out.toByteArray(); - out.close(); - out = null; - - if ((docTitle == null) || (docTitle.length() == 0)) { - docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")). - replaceAll("\r\n"," "). - replaceAll("\n"," "). - replaceAll("\r"," "). 
- replaceAll("\t"," "); + // creating a writer for output + if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + writerFile = File.createTempFile("pdfParser",".tmp"); + writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); + } else { + writer = new serverCharBuffer(); } + + stripper.writeText(theDocument, writer ); + theDocument.close(); theDocument = null; + writer.close(); + String[] docKeywords = null; if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); - plasmaParserDocument theDoc = new plasmaParserDocument( - location, - mimeType, - "UTF-8", - docKeywords, - docSubject, - docTitle, - null, - null, - contents, - null, - null); + plasmaParserDocument theDoc = null; + + if (writer instanceof serverCharBuffer) { + byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8"); + theDoc = new plasmaParserDocument( + location, + mimeType, + "UTF-8", + docKeywords, + docSubject, + docTitle, + null, + null, + contentBytes, + null, + null); + } else { + theDoc = new plasmaParserDocument( + location, + mimeType, + "UTF-8", + docKeywords, + docSubject, + docTitle, + null, + null, + writerFile, + null, + null); + } return theDoc; } @@ -164,6 +184,12 @@ public class pdfParser extends AbstractParser implements Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; + // close the writer + if (writer != null) try { writer.close(); } catch (Exception ex) {/* ignore this */} + + // delete the file + if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */} + throw new ParserException("Unexpected error while parsing pdf file. 
" + e.getMessage(),location); } finally { if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */} @@ -173,8 +199,7 @@ public class pdfParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + this.fileSize = -1; } } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 98bcfda5c..c054f079e 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -80,8 +80,7 @@ implements Parser { this.parserName = "Rich Text Format Parser"; } - public plasmaParserDocument parse(URL location, String mimeType, String charset, - InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { @@ -105,7 +104,7 @@ implements Parser { null, null, null, - bodyText.getBytes(), + bodyText.getBytes("UTF-8"), null, null); diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 023f38b40..4d3ff6860 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -43,8 +43,11 @@ package de.anomic.plasma.parser.tar; +import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.InputStream; +import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; @@ -96,7 +99,18 @@ public class tarParser extends AbstractParser implements Parser { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { + long docTextLength = 0; + OutputStream docText = null; + File outputFile = null; + plasmaParserDocument subDoc = null; try { + 
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + outputFile = File.createTempFile("zipParser",".tmp"); + docText = new BufferedOutputStream(new FileOutputStream(outputFile)); + } else { + docText = new serverByteBuffer(); + } + // creating a new parser class to parse the unzipped content plasmaParser theParser = new plasmaParser(); @@ -116,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser { StringBuffer docLongTitle = new StringBuffer(); LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); - serverByteBuffer docText = new serverByteBuffer(); + Map docAnchors = new HashMap(); TreeSet docImages = new TreeSet(); @@ -141,55 +155,58 @@ public class tarParser extends AbstractParser implements Parser { String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); // getting the entry content - plasmaParserDocument theDoc = null; - File tempFile = null; + File subDocTempFile = null; try { // create the temp file - tempFile = createTempFile(entryName); + subDocTempFile = createTempFile(entryName); // copy the data into the file - serverFileUtils.copy(tin,tempFile,entry.getSize()); + serverFileUtils.copy(tin,subDocTempFile,entry.getSize()); // check for interruption checkInterruption(); // parsing the content - theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile); + subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,subDocTempFile); } catch (ParserException e) { this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. 
" + e.getMessage()); } finally { - if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} + if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */} } - if (theDoc == null) continue; + if (subDoc == null) continue; // merging all documents together if (docKeywords.length() > 0) docKeywords.append(","); - docKeywords.append(theDoc.getKeywords(',')); + docKeywords.append(subDoc.getKeywords(',')); if (docLongTitle.length() > 0) docLongTitle.append("\n"); - docLongTitle.append(theDoc.getMainLongTitle()); + docLongTitle.append(subDoc.getMainLongTitle()); if (docShortTitle.length() > 0) docShortTitle.append("\n"); - docShortTitle.append(theDoc.getMainShortTitle()); + docShortTitle.append(subDoc.getMainShortTitle()); - docSections.addAll(Arrays.asList(theDoc.getSectionTitles())); + docSections.addAll(Arrays.asList(subDoc.getSectionTitles())); if (docAbstrct.length() > 0) docAbstrct.append("\n"); - docAbstrct.append(theDoc.getAbstract()); + docAbstrct.append(subDoc.getAbstract()); - if (docText.length() > 0) docText.append("\n"); - docText.append(theDoc.getText()); + if (subDoc.getTextLength() > 0) { + if (docTextLength > 0) docText.write('\n'); + docTextLength += serverFileUtils.copy(subDoc.getText(), docText); + } + + docAnchors.putAll(subDoc.getAnchors()); + docImages.addAll(subDoc.getImages()); - docAnchors.putAll(theDoc.getAnchors()); - docImages.addAll(theDoc.getImages()); + // release subdocument + subDoc.close(); + subDoc = null; } - /* (URL location, String mimeType, - String keywords, String shortTitle, String longTitle, - String[] sections, String abstrct, - byte[] text, Map anchors, Map images) - */ - return new plasmaParserDocument( + plasmaParserDocument result = null; + + if (docText instanceof serverByteBuffer) { + result = new plasmaParserDocument( location, mimeType, null, @@ -198,13 +215,37 @@ public class tarParser extends AbstractParser implements Parser { 
docLongTitle.toString(), (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), - docText.toByteArray(), + ((serverByteBuffer)docText).toByteArray(), docAnchors, docImages); + } else { + result = new plasmaParserDocument( + location, + mimeType, + null, + docKeywords.toString().split(" |,"), + docShortTitle.toString(), + docLongTitle.toString(), + (String[])docSections.toArray(new String[docSections.size()]), + docAbstrct.toString(), + outputFile, + docAnchors, + docImages); + } + + return result; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; + if (subDoc != null) subDoc.close(); + + // close the writer + if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */} + + // delete the file + if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */} + throw new ParserException("Unexpected error while parsing tar resource. 
" + e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index aca8f6505..8a523dbcf 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -43,8 +43,11 @@ package de.anomic.plasma.parser.zip; +import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.InputStream; +import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; @@ -94,13 +97,23 @@ public class zipParser extends AbstractParser implements Parser { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { + long docTextLength = 0; + OutputStream docText = null; + File outputFile = null; + plasmaParserDocument subDoc = null; try { + if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + outputFile = File.createTempFile("zipParser",".tmp"); + docText = new BufferedOutputStream(new FileOutputStream(outputFile)); + } else { + docText = new serverByteBuffer(); + } + StringBuffer docKeywords = new StringBuffer(); StringBuffer docShortTitle = new StringBuffer(); StringBuffer docLongTitle = new StringBuffer(); LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); - serverByteBuffer docText = new serverByteBuffer(); Map docAnchors = new HashMap(); TreeSet docImages = new TreeSet(); @@ -128,48 +141,56 @@ public class zipParser extends AbstractParser implements Parser { String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); // parsing the content - plasmaParserDocument theDoc = null; - File tempFile = null; + File subDocTempFile = null; try { // create the temp file - tempFile = createTempFile(entryName); + subDocTempFile = createTempFile(entryName); // copy the data into the file - 
serverFileUtils.copy(zippedContent,tempFile,entry.getSize()); + serverFileUtils.copy(zippedContent,subDocTempFile,entry.getSize()); // parsing the zip file entry - theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile); + subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, subDocTempFile); } catch (ParserException e) { this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage()); } finally { - if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} + if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */} } - if (theDoc == null) continue; + if (subDoc == null) continue; // merging all documents together if (docKeywords.length() > 0) docKeywords.append(","); - docKeywords.append(theDoc.getKeywords(',')); + docKeywords.append(subDoc.getKeywords(',')); if (docLongTitle.length() > 0) docLongTitle.append("\n"); - docLongTitle.append(theDoc.getMainLongTitle()); + docLongTitle.append(subDoc.getMainLongTitle()); if (docShortTitle.length() > 0) docShortTitle.append("\n"); - docShortTitle.append(theDoc.getMainShortTitle()); + docShortTitle.append(subDoc.getMainShortTitle()); - docSections.addAll(Arrays.asList(theDoc.getSectionTitles())); + docSections.addAll(Arrays.asList(subDoc.getSectionTitles())); if (docAbstrct.length() > 0) docAbstrct.append("\n"); - docAbstrct.append(theDoc.getAbstract()); + docAbstrct.append(subDoc.getAbstract()); - if (docText.length() > 0) docText.append("\n"); - docText.append(theDoc.getText()); + if (subDoc.getTextLength() > 0) { + if (docTextLength > 0) docText.write('\n'); + docTextLength += serverFileUtils.copy(subDoc.getText(), docText); + } - docAnchors.putAll(theDoc.getAnchors()); - docImages.addAll(theDoc.getImages()); + docAnchors.putAll(subDoc.getAnchors()); + docImages.addAll(subDoc.getImages()); + + // release subdocument + subDoc.close(); + subDoc = null; } - 
return new plasmaParserDocument( + plasmaParserDocument result = null; + + if (docText instanceof serverByteBuffer) { + result = new plasmaParserDocument( location, mimeType, null, @@ -178,13 +199,37 @@ public class zipParser extends AbstractParser implements Parser { docLongTitle.toString(), (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), - docText.toByteArray(), + ((serverByteBuffer)docText).toByteArray(), docAnchors, docImages); + } else { + result = new plasmaParserDocument( + location, + mimeType, + null, + docKeywords.toString().split(" |,"), + docShortTitle.toString(), + docLongTitle.toString(), + (String[])docSections.toArray(new String[docSections.size()]), + docAbstrct.toString(), + outputFile, + docAnchors, + docImages); + } + + return result; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; + if (subDoc != null) subDoc.close(); + + // close the writer + if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */} + + // delete the file + if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */} + throw new ParserException("Unexpected error while parsing zip resource. 
" + e.getClass().getName() + ": "+ e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index ba6d7d43e..c05f08e5b 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -117,6 +117,7 @@ public class plasmaCrawlEURL extends indexURL { // Parser errors public static final String DENIED_PARSER_ERROR = "denied_(parser_error)"; + public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)"; public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)"; // indexing errors diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 536307b8a..b0038249f 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -515,7 +515,7 @@ public final class plasmaParser { // testing if the resource is not empty if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - String errorMsg = "No resource content available."; + String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; this.theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); } diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index c8de2b6af..0ca9bb8a1 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -42,8 +42,14 @@ package de.anomic.plasma; +import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; import java.net.MalformedURLException; +import de.anomic.server.serverFileUtils; + import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -62,7 +68,7 @@ public class plasmaParserDocument { String longTitle; // the real title of the document, commonly h1-tags String[] sections; // if present: more titles/headlines appearing in the document String abstrct; // an abstract, if present: short content description - byte[] text; // the clear text, all that is visible + private Object text; // the clear text, all that is visible Map anchors; // all links embedded as clickeable entities (anchor tags) TreeSet images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. @@ -95,6 +101,29 @@ public class plasmaParserDocument { this.condenser = null; this.resorted = false; } + + public plasmaParserDocument(URL location, String mimeType, String charset, + String[] keywords, String shortTitle, String longTitle, + String[] sections, String abstrct, + File text, Map anchors, TreeSet images) { + this.location = location; + this.mimeType = (mimeType==null)?"application/octet-stream":mimeType; + this.charset = charset; + this.keywords = (keywords==null) ? 
new String[0] : keywords; + this.shortTitle = (shortTitle==null)?"":shortTitle; + this.longTitle = (longTitle==null)?"":longTitle; + this.sections = (sections==null)?new String[0]:sections; + this.abstrct = (abstrct==null)?"":abstrct; + this.text = text; + if (text != null) text.deleteOnExit(); + this.anchors = (anchors==null)?new HashMap(0):anchors; + this.images = (images==null)?new TreeSet():images; + this.hyperlinks = null; + this.medialinks = null; + this.emaillinks = null; + this.condenser = null; + this.resorted = false; + } public String getMimeType() { return this.mimeType; @@ -103,7 +132,7 @@ public class plasmaParserDocument { /** * @return the supposed charset of this document or null if unknown */ - public String getCharset() { + public String getSourceCharset() { return this.charset; } @@ -123,13 +152,41 @@ public class plasmaParserDocument { if (abstrct != null) return abstrct; else return getMainLongTitle(); } - public byte[] getText() { - // returns only the clear (visible) text (not the source data) - return text; + public InputStream getText() { + try { + if (this.text == null) return null; + + if (this.text instanceof File) return new BufferedInputStream(new FileInputStream((File)this.text)); + else if (this.text instanceof byte[]) return new ByteArrayInputStream((byte[])this.text); + + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + + public byte[] getTextBytes() { + try { + if (this.text == null) return new byte[0]; + + if (this.text instanceof File) return serverFileUtils.read((File)this.text); + else if (this.text instanceof byte[]) return (byte[])this.text; + } catch (Exception e) { + e.printStackTrace(); + } + return new byte[0]; + } + + public long getTextLength() { + if (this.text == null) return 0; + if (this.text instanceof File) return ((File)this.text).length(); + else if (this.text instanceof byte[]) return ((byte[])this.text).length; + + return -1; } public plasmaCondenser getCondenser() { - if (condenser == 
null) condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0); + if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0); return condenser; } @@ -262,4 +319,16 @@ public class plasmaParserDocument { this.resorted = true; } + public void close() { + // delete the temp file + if ((this.text != null) && (this.text instanceof File)) { + try { ((File)this.text).delete(); } catch (Exception e) {/* ignore this */} + } + } + + protected void finalize() throws Throwable { + this.close(); + super.finalize(); + } + } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4e0fba5ba..4d37c31b7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1416,6 +1416,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException { + plasmaParserDocument document = null; try { // work off one stack entry with a fresh resource long stackStartTime = 0, stackEndTime = 0, @@ -1456,7 +1457,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser /* ========================================================================= * PARSE CONTENT * ========================================================================= */ - plasmaParserDocument document = null; parsingStartTime = System.currentTimeMillis(); try { @@ -1527,7 +1527,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logFine("Condensing for '" + entry.normalizedURLString() + "'"); - plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); + plasmaCondenser condenser = new plasmaCondenser(document.getText()); // generate citation reference Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); @@ 
-1700,8 +1700,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logInfo("*Indexed " + words + " words in URL " + entry.url() + " [" + entry.urlHash() + "]" + "\n\tDescription: " + docDescription + - "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " + - "Size: " + document.text.length + " bytes | " + + "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " + + "Size: " + document.getTextLength() + " bytes | " + "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) + "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + @@ -1744,6 +1744,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); } } + document.close(); document = null; } catch (Exception e) { this.log.logSevere("Unexpected exception while parsing/indexing URL ",e); @@ -1772,6 +1773,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser cacheManager.deleteFile(entry.url()); } entry = null; + + if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ } } } @@ -1807,7 +1810,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents - kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getText().length, 3) + // length of plain text in bytes + kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes 
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_SIMI_WORDS, 3) + // count of all unique words kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote) @@ -2173,7 +2176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser try { // get set of words // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText()); + Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes()); // delete all word references int count = removeReferences(urlhash, witer); // finally delete the url entry itself