@ -43,8 +43,11 @@
package de.anomic.plasma.parser.tar ;
import java.io.BufferedOutputStream ;
import java.io.File ;
import java.io.FileOutputStream ;
import java.io.InputStream ;
import java.io.OutputStream ;
import java.util.Arrays ;
import java.util.HashMap ;
import java.util.Hashtable ;
@ -96,7 +99,18 @@ public class tarParser extends AbstractParser implements Parser {
public plasmaParserDocument parse ( URL location , String mimeType , String charset , InputStream source ) throws ParserException , InterruptedException {
long docTextLength = 0 ;
OutputStream docText = null ;
File outputFile = null ;
plasmaParserDocument subDoc = null ;
try {
if ( ( this . fileSize ! = - 1 ) & & ( this . fileSize > Parser . MAX_KEEP_IN_MEMORY_SIZE ) ) {
outputFile = File . createTempFile ( "zipParser" , ".tmp" ) ;
docText = new BufferedOutputStream ( new FileOutputStream ( outputFile ) ) ;
} else {
docText = new serverByteBuffer ( ) ;
}
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser ( ) ;
@ -116,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docLongTitle = new StringBuffer ( ) ;
LinkedList docSections = new LinkedList ( ) ;
StringBuffer docAbstrct = new StringBuffer ( ) ;
serverByteBuffer docText = new serverByteBuffer ( ) ;
Map docAnchors = new HashMap ( ) ;
TreeSet docImages = new TreeSet ( ) ;
@ -141,55 +155,58 @@ public class tarParser extends AbstractParser implements Parser {
String entryMime = plasmaParser . getMimeTypeByFileExt ( entryExt ) ;
// getting the entry content
plasmaParserDocument theDoc = null ;
File tempFile = null ;
File subDocTempFile = null ;
try {
// create the temp file
t empFile = createTempFile ( entryName ) ;
subDocT empFile = createTempFile ( entryName ) ;
// copy the data into the file
serverFileUtils . copy ( tin , t empFile, entry . getSize ( ) ) ;
serverFileUtils . copy ( tin , subDocT empFile, entry . getSize ( ) ) ;
// check for interruption
checkInterruption ( ) ;
// parsing the content
the Doc = theParser . parseSource ( new URL ( location , "#" + entryName ) , entryMime , null , t empFile) ;
sub Doc = theParser . parseSource ( new URL ( location , "#" + entryName ) , entryMime , null , subDocT empFile) ;
} catch ( ParserException e ) {
this . theLogger . logInfo ( "Unable to parse tar file entry '" + entryName + "'. " + e . getMessage ( ) ) ;
} finally {
if ( tempFile ! = null ) try { t empFile. delete ( ) ; } catch ( Exception ex ) { /* ignore this */ }
if ( subDocTempFile ! = null ) try { subDocT empFile. delete ( ) ; } catch ( Exception ex ) { /* ignore this */ }
}
if ( the Doc = = null ) continue ;
if ( sub Doc = = null ) continue ;
// merging all documents together
if ( docKeywords . length ( ) > 0 ) docKeywords . append ( "," ) ;
docKeywords . append ( the Doc. getKeywords ( ',' ) ) ;
docKeywords . append ( sub Doc. getKeywords ( ',' ) ) ;
if ( docLongTitle . length ( ) > 0 ) docLongTitle . append ( "\n" ) ;
docLongTitle . append ( the Doc. getMainLongTitle ( ) ) ;
docLongTitle . append ( sub Doc. getMainLongTitle ( ) ) ;
if ( docShortTitle . length ( ) > 0 ) docShortTitle . append ( "\n" ) ;
docShortTitle . append ( the Doc. getMainShortTitle ( ) ) ;
docShortTitle . append ( sub Doc. getMainShortTitle ( ) ) ;
docSections . addAll ( Arrays . asList ( the Doc. getSectionTitles ( ) ) ) ;
docSections . addAll ( Arrays . asList ( sub Doc. getSectionTitles ( ) ) ) ;
if ( docAbstrct . length ( ) > 0 ) docAbstrct . append ( "\n" ) ;
docAbstrct . append ( the Doc. getAbstract ( ) ) ;
docAbstrct . append ( sub Doc. getAbstract ( ) ) ;
if ( docText . length ( ) > 0 ) docText . append ( "\n" ) ;
docText . append ( theDoc . getText ( ) ) ;
if ( subDoc . getTextLength ( ) > 0 ) {
if ( docTextLength > 0 ) docText . write ( '\n' ) ;
docTextLength + = serverFileUtils . copy ( subDoc . getText ( ) , docText ) ;
}
docAnchors . putAll ( subDoc . getAnchors ( ) ) ;
docImages . addAll ( subDoc . getImages ( ) ) ;
docAnchors . putAll ( theDoc . getAnchors ( ) ) ;
docImages . addAll ( theDoc . getImages ( ) ) ;
// release subdocument
subDoc . close ( ) ;
subDoc = null ;
}
/ * ( URL location , String mimeType ,
String keywords , String shortTitle , String longTitle ,
String [ ] sections , String abstrct ,
byte [ ] text , Map anchors , Map images )
* /
return new plasmaParserDocument (
plasmaParserDocument result = null ;
if ( docText instanceof serverByteBuffer ) {
result = new plasmaParserDocument (
location ,
mimeType ,
null ,
@ -198,13 +215,37 @@ public class tarParser extends AbstractParser implements Parser {
docLongTitle . toString ( ) ,
( String [ ] ) docSections . toArray ( new String [ docSections . size ( ) ] ) ,
docAbstrct . toString ( ) ,
docText . toByteArray ( ) ,
( ( serverByteBuffer ) docText ) . toByteArray ( ) ,
docAnchors ,
docImages ) ;
} else {
result = new plasmaParserDocument (
location ,
mimeType ,
null ,
docKeywords . toString ( ) . split ( " |," ) ,
docShortTitle . toString ( ) ,
docLongTitle . toString ( ) ,
( String [ ] ) docSections . toArray ( new String [ docSections . size ( ) ] ) ,
docAbstrct . toString ( ) ,
outputFile ,
docAnchors ,
docImages ) ;
}
return result ;
} catch ( Exception e ) {
if ( e instanceof InterruptedException ) throw ( InterruptedException ) e ;
if ( e instanceof ParserException ) throw ( ParserException ) e ;
if ( subDoc ! = null ) subDoc . close ( ) ;
// close the writer
if ( docText ! = null ) try { docText . close ( ) ; } catch ( Exception ex ) { /* ignore this */ }
// delete the file
if ( outputFile ! = null ) try { outputFile . delete ( ) ; } catch ( Exception ex ) { /* ignore this */ }
throw new ParserException ( "Unexpected error while parsing tar resource. " + e . getMessage ( ) , location ) ;
}
}