*) Better handling of large files during parsing

Extracted text of files that are larger than 5MB is stored in a temp file instead of keeping it in memory
*) plasmaParserDocument.java: getText now returns an InputStream instead of a byte array
*) plasmaParserDocument.java: new function getTextBytes returns the parsed content as byte array
   Attention: the caller of this function has to ensure that enough memory is available to do this 
   to avoid OutOfMemory Exceptions
*) httpd.java: better error handling if the soapHandler is not installed
*) pdfParser.java: 
   - better handling of documents with exotic charsets
   - better handling of large documents
   - better error logging of encrypted documents
*) rtfParser.java: Bugfix for UTF-8 support
*) tarParser.java: better handling of large documents
*) zipParser.java: better handling of large documents
*) plasmaCrawlEURL.java: new errorcode for encrypted documents
*) plasmaParserDocument.java: the extracted text can now be passed
   to this object as byte array or temp file   

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2679 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 8b2ceddb91
commit cd5f349666

@ -216,7 +216,7 @@ public class ViewFile {
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getText());
String content = new String(document.getTextBytes());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");

@ -406,24 +406,24 @@ public final class httpd implements serverHandler {
/*
* Handling SOAP Requests here ...
*/
if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap")) {
if (soapHandler == null) {
if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap/")) {
if (this.soapHandler == null) {
try {
Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler");
Constructor classConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } );
soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard });
this.soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard });
} catch (Exception e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (NoClassDefFoundError e) {
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (Error e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
}
}
soapHandler.doGet(this.prop, header, this.session.out);
this.soapHandler.doGet(this.prop, header, this.session.out);
/*
* Handling HTTP requests here ...
@ -541,7 +541,7 @@ public final class httpd implements serverHandler {
// we now know the HTTP version. depending on that, we read the header
httpHeader header;
String httpVersion = prop.getProperty("HTTP", "HTTP/0.9");
String httpVersion = this.prop.getProperty("HTTP", "HTTP/0.9");
if (httpVersion.equals("HTTP/0.9")) header = new httpHeader(reverseMappingCache);
else header = httpHeader.readHeader(this.prop,this.session);
@ -559,8 +559,8 @@ public final class httpd implements serverHandler {
/*
* Handling SOAP Requests here ...
*/
if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap")) {
if (soapHandler == null) {
if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap/")) {
if (this.soapHandler == null) {
try {
// creating the soap handler class by name
Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler");
@ -569,19 +569,19 @@ public final class httpd implements serverHandler {
Constructor soapHandlerConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } );
// creating the new object
soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } );
this.soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } );
} catch (Exception e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (NoClassDefFoundError e) {
sendRespondError(this.prop,this.session.out,4,503,"SOAP Extension not installed","SOAP Extension not installed",null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (Error e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
}
}
soapHandler.doPost(prop, header, this.session.out, this.session.in);
this.soapHandler.doPost(this.prop, header, this.session.out, this.session.in);
/*
* Handling normal HTTP requests here ...
*/
@ -981,7 +981,7 @@ public final class httpd implements serverHandler {
int httpStatusCode,
String httpStatusText,
String detailedErrorMsg,
Exception stackTrace
Throwable stackTrace
) throws IOException {
sendRespondError(
conProp,
@ -1004,7 +1004,7 @@ public final class httpd implements serverHandler {
String httpStatusText,
File detailedErrorMsgFile,
serverObjects detailedErrorMsgValues,
Exception stackTrace
Throwable stackTrace
) throws IOException {
sendRespondError(
conProp,
@ -1029,7 +1029,7 @@ public final class httpd implements serverHandler {
String detailedErrorMsgText,
Object detailedErrorMsgFile,
serverObjects detailedErrorMsgValues,
Exception stackTrace,
Throwable stackTrace,
httpHeader header
) throws IOException {

@ -64,7 +64,7 @@ import de.anomic.server.logging.serverLog;
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public abstract class AbstractParser implements Parser{
/**
* a list of library names that are needed by this parser
*/
@ -86,6 +86,12 @@ public abstract class AbstractParser implements Parser{
*/
protected String parserName = this.getClass().getName();
/**
* The source file file size in bytes if the source document was passed
* in as file
*/
protected long fileSize = -1;
/**
* The Constructor of this class.
*/
@ -178,6 +184,9 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
// getting the file size of the document
this.fileSize = sourceFile.length();
// create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));

@ -60,6 +60,9 @@ import de.anomic.server.logging.serverLog;
*/
public interface Parser {
public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;
/**
* Parsing a document available as byte array
* @param location the origin of the document

@ -43,8 +43,11 @@
package de.anomic.plasma.parser.pdf;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Hashtable;
import org.pdfbox.pdfparser.PDFParser;
@ -53,11 +56,12 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer;
public class pdfParser extends AbstractParser implements Parser {
@ -87,9 +91,9 @@ public class pdfParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null;
OutputStreamWriter writer = null;
Writer writer = null;
File writerFile = null;
try {
// reducing thread priority
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
@ -114,6 +118,10 @@ public class pdfParser extends AbstractParser implements Parser {
PDFTextStripper stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
if (theDocument.isEncrypted()) {
throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED);
}
// extracting some metadata
PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
if (theDocInfo != null) {
@ -121,42 +129,54 @@ public class pdfParser extends AbstractParser implements Parser {
docSubject = theDocInfo.getSubject();
//docAuthor = theDocInfo.getAuthor();
docKeywordStr = theDocInfo.getKeywords();
}
serverByteBuffer out = new serverByteBuffer();
writer = new OutputStreamWriter( out );
stripper.writeText(theDocument, writer );
writer.close(); writer = null;
theDocument.close(); theDocument = null;
}
byte[] contents = out.toByteArray();
out.close();
out = null;
if ((docTitle == null) || (docTitle.length() == 0)) {
docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
replaceAll("\t"," ");
// creating a writer for output
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".tmp");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new serverCharBuffer();
}
stripper.writeText(theDocument, writer );
theDocument.close(); theDocument = null;
writer.close();
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docSubject,
docTitle,
null,
null,
contents,
null,
null);
plasmaParserDocument theDoc = null;
if (writer instanceof serverCharBuffer) {
byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docSubject,
docTitle,
null,
null,
contentBytes,
null,
null);
} else {
theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docSubject,
docTitle,
null,
null,
writerFile,
null,
null);
}
return theDoc;
}
@ -164,6 +184,12 @@ public class pdfParser extends AbstractParser implements Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// close the writer
if (writer != null) try { writer.close(); } catch (Exception ex) {/* ignore this */}
// delete the file
if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */}
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
@ -173,8 +199,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
public void reset() {
// Nothing todo here at the moment
this.fileSize = -1;
}
}

@ -80,8 +80,7 @@ implements Parser {
this.parserName = "Rich Text Format Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
@ -105,7 +104,7 @@ implements Parser {
null,
null,
null,
bodyText.getBytes(),
bodyText.getBytes("UTF-8"),
null,
null);

@ -43,8 +43,11 @@
package de.anomic.plasma.parser.tar;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@ -96,7 +99,18 @@ public class tarParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;
OutputStream docText = null;
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new serverByteBuffer();
}
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@ -116,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docLongTitle = new StringBuffer();
LinkedList docSections = new LinkedList();
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
TreeSet docImages = new TreeSet();
@ -141,55 +155,58 @@ public class tarParser extends AbstractParser implements Parser {
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// getting the entry content
plasmaParserDocument theDoc = null;
File tempFile = null;
File subDocTempFile = null;
try {
// create the temp file
tempFile = createTempFile(entryName);
subDocTempFile = createTempFile(entryName);
// copy the data into the file
serverFileUtils.copy(tin,tempFile,entry.getSize());
serverFileUtils.copy(tin,subDocTempFile,entry.getSize());
// check for interruption
checkInterruption();
// parsing the content
theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile);
subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,subDocTempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
if (subDoc == null) continue;
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
docKeywords.append(theDoc.getKeywords(','));
docKeywords.append(subDoc.getKeywords(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
docLongTitle.append(theDoc.getMainLongTitle());
docLongTitle.append(subDoc.getMainLongTitle());
if (docShortTitle.length() > 0) docShortTitle.append("\n");
docShortTitle.append(theDoc.getMainShortTitle());
docShortTitle.append(subDoc.getMainShortTitle());
docSections.addAll(Arrays.asList(theDoc.getSectionTitles()));
docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
docAbstrct.append(theDoc.getAbstract());
docAbstrct.append(subDoc.getAbstract());
if (docText.length() > 0) docText.append("\n");
docText.append(theDoc.getText());
if (subDoc.getTextLength() > 0) {
if (docTextLength > 0) docText.write('\n');
docTextLength += serverFileUtils.copy(subDoc.getText(), docText);
}
docAnchors.putAll(subDoc.getAnchors());
docImages.addAll(subDoc.getImages());
docAnchors.putAll(theDoc.getAnchors());
docImages.addAll(theDoc.getImages());
// release subdocument
subDoc.close();
subDoc = null;
}
/* (URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images)
*/
return new plasmaParserDocument(
plasmaParserDocument result = null;
if (docText instanceof serverByteBuffer) {
result = new plasmaParserDocument(
location,
mimeType,
null,
@ -198,13 +215,37 @@ public class tarParser extends AbstractParser implements Parser {
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
docText.toByteArray(),
((serverByteBuffer)docText).toByteArray(),
docAnchors,
docImages);
} else {
result = new plasmaParserDocument(
location,
mimeType,
null,
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
outputFile,
docAnchors,
docImages);
}
return result;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
if (subDoc != null) subDoc.close();
// close the writer
if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */}
// delete the file
if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */}
throw new ParserException("Unexpected error while parsing tar resource. " + e.getMessage(),location);
}
}

@ -43,8 +43,11 @@
package de.anomic.plasma.parser.zip;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@ -94,13 +97,23 @@ public class zipParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;
OutputStream docText = null;
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new serverByteBuffer();
}
StringBuffer docKeywords = new StringBuffer();
StringBuffer docShortTitle = new StringBuffer();
StringBuffer docLongTitle = new StringBuffer();
LinkedList docSections = new LinkedList();
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
TreeSet docImages = new TreeSet();
@ -128,48 +141,56 @@ public class zipParser extends AbstractParser implements Parser {
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// parsing the content
plasmaParserDocument theDoc = null;
File tempFile = null;
File subDocTempFile = null;
try {
// create the temp file
tempFile = createTempFile(entryName);
subDocTempFile = createTempFile(entryName);
// copy the data into the file
serverFileUtils.copy(zippedContent,tempFile,entry.getSize());
serverFileUtils.copy(zippedContent,subDocTempFile,entry.getSize());
// parsing the zip file entry
theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile);
subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, subDocTempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
if (subDoc == null) continue;
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
docKeywords.append(theDoc.getKeywords(','));
docKeywords.append(subDoc.getKeywords(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
docLongTitle.append(theDoc.getMainLongTitle());
docLongTitle.append(subDoc.getMainLongTitle());
if (docShortTitle.length() > 0) docShortTitle.append("\n");
docShortTitle.append(theDoc.getMainShortTitle());
docShortTitle.append(subDoc.getMainShortTitle());
docSections.addAll(Arrays.asList(theDoc.getSectionTitles()));
docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
docAbstrct.append(theDoc.getAbstract());
docAbstrct.append(subDoc.getAbstract());
if (docText.length() > 0) docText.append("\n");
docText.append(theDoc.getText());
if (subDoc.getTextLength() > 0) {
if (docTextLength > 0) docText.write('\n');
docTextLength += serverFileUtils.copy(subDoc.getText(), docText);
}
docAnchors.putAll(theDoc.getAnchors());
docImages.addAll(theDoc.getImages());
docAnchors.putAll(subDoc.getAnchors());
docImages.addAll(subDoc.getImages());
// release subdocument
subDoc.close();
subDoc = null;
}
return new plasmaParserDocument(
plasmaParserDocument result = null;
if (docText instanceof serverByteBuffer) {
result = new plasmaParserDocument(
location,
mimeType,
null,
@ -178,13 +199,37 @@ public class zipParser extends AbstractParser implements Parser {
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
docText.toByteArray(),
((serverByteBuffer)docText).toByteArray(),
docAnchors,
docImages);
} else {
result = new plasmaParserDocument(
location,
mimeType,
null,
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
outputFile,
docAnchors,
docImages);
}
return result;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
if (subDoc != null) subDoc.close();
// close the writer
if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */}
// delete the file
if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */}
throw new ParserException("Unexpected error while parsing zip resource. " + e.getClass().getName() + ": "+ e.getMessage(),location);
}
}

@ -117,6 +117,7 @@ public class plasmaCrawlEURL extends indexURL {
// Parser errors
public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";
public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)";
public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)";
// indexing errors

@ -515,7 +515,7 @@ public final class plasmaParser {
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
String errorMsg = "No resource content available.";
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}

@ -42,8 +42,14 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -62,7 +68,7 @@ public class plasmaParserDocument {
String longTitle; // the real title of the document, commonly h1-tags
String[] sections; // if present: more titles/headlines appearing in the document
String abstrct; // an abstract, if present: short content description
byte[] text; // the clear text, all that is visible
private Object text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
TreeSet images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
@ -95,6 +101,29 @@ public class plasmaParserDocument {
this.condenser = null;
this.resorted = false;
}
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
File text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.charset = charset;
this.keywords = (keywords==null) ? new String[0] : keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle;
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = text;
if (text != null) text.deleteOnExit();
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
}
public String getMimeType() {
return this.mimeType;
@ -103,7 +132,7 @@ public class plasmaParserDocument {
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/
public String getCharset() {
public String getSourceCharset() {
return this.charset;
}
@ -123,13 +152,41 @@ public class plasmaParserDocument {
if (abstrct != null) return abstrct; else return getMainLongTitle();
}
public byte[] getText() {
// returns only the clear (visible) text (not the source data)
return text;
public InputStream getText() {
try {
if (this.text == null) return null;
if (this.text instanceof File) return new BufferedInputStream(new FileInputStream((File)this.text));
else if (this.text instanceof byte[]) return new ByteArrayInputStream((byte[])this.text);
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
public byte[] getTextBytes() {
try {
if (this.text == null) return new byte[0];
if (this.text instanceof File) return serverFileUtils.read((File)this.text);
else if (this.text instanceof byte[]) return (byte[])this.text;
} catch (Exception e) {
e.printStackTrace();
}
return new byte[0];
}
public long getTextLength() {
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
return -1;
}
public plasmaCondenser getCondenser() {
if (condenser == null) condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0);
if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
return condenser;
}
@ -262,4 +319,16 @@ public class plasmaParserDocument {
this.resorted = true;
}
public void close() {
// delete the temp file
if ((this.text != null) && (this.text instanceof File)) {
try { ((File)this.text).delete(); } catch (Exception e) {/* ignore this */}
}
}
protected void finalize() throws Throwable {
this.close();
super.finalize();
}
}

@ -1416,6 +1416,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
plasmaParserDocument document = null;
try {
// work off one stack entry with a fresh resource
long stackStartTime = 0, stackEndTime = 0,
@ -1456,7 +1457,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* =========================================================================
* PARSE CONTENT
* ========================================================================= */
plasmaParserDocument document = null;
parsingStartTime = System.currentTimeMillis();
try {
@ -1527,7 +1527,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
plasmaCondenser condenser = new plasmaCondenser(document.getText());
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser);
@ -1700,8 +1700,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.text.length + " bytes | " +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
@ -1744,6 +1744,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
}
document.close();
document = null;
} catch (Exception e) {
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
@ -1772,6 +1773,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cacheManager.deleteFile(entry.url());
}
entry = null;
if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
}
}
@ -1807,7 +1810,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getText().length, 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_SIMI_WORDS, 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
@ -2173,7 +2176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try {
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself

Loading…
Cancel
Save