@ -39,8 +39,10 @@ import java.util.Date;
import java.util.HashSet ;
import java.util.List ;
import org.apache.pdfbox.Loader ;
import org.apache.pdfbox.cos.COSName ;
import org.apache.pdfbox.io.MemoryUsageSetting ;
import org.apache.pdfbox.io.RandomAccessRead ;
import org.apache.pdfbox.io.RandomAccessReadBuffer ;
import org.apache.pdfbox.pdmodel.PDDocument ;
import org.apache.pdfbox.pdmodel.PDDocumentInformation ;
import org.apache.pdfbox.pdmodel.PDPage ;
@ -98,8 +100,8 @@ public class pdfParser extends AbstractParser implements Parser {
PDDocument pdfDoc ;
try {
Thread . currentThread ( ) . setPriority ( Thread . MIN_PRIORITY ) ; // the pdfparser is a big pain
MemoryUsageSetting mus = MemoryUsageSetting . setupMixed ( 200 * 1024 * 1024 ) ;
pdfDoc = PDDocument. load ( source , mus ) ;
final RandomAccessRead readBuffer = new RandomAccessReadBuffer ( source ) ;
pdfDoc = Loader. loadPDF ( readBuffer ) ;
} catch ( final IOException e ) {
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
} finally {
@ -155,8 +157,8 @@ public class pdfParser extends AbstractParser implements Parser {
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
int pagecount = pdfDoc . getNumberOfPages ( ) ;
String [ ] pages = new String [ pagecount ] ;
final int pagecount = pdfDoc . getNumberOfPages ( ) ;
final String [ ] pages = new String [ pagecount ] ;
for ( int page = 1 ; page < = pagecount ; page + + ) {
stripper . setStartPage ( page ) ;
stripper . setEndPage ( page ) ;
@ -167,7 +169,7 @@ public class pdfParser extends AbstractParser implements Parser {
// create individual documents for each page
assert pages . length = = pdflinks . size ( ) : "pages.length = " + pages . length + ", pdflinks.length = " + pdflinks . size ( ) ;
result = new Document [ Math . min ( pages . length , pdflinks . size ( ) ) ] ;
String loc = location . toNormalform ( true ) ;
final String loc = location . toNormalform ( true ) ;
for ( int page = 0 ; page < result . length ; page + + ) {
result [ page ] = new Document (
new AnchorURL ( loc + ( loc . indexOf ( '?' ) > 0 ? '&' : '?' ) + individualPagePropertyname + '=' + ( page + 1 ) ) , // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
@ -217,8 +219,8 @@ public class pdfParser extends AbstractParser implements Parser {
writer . close ( ) ; // free writer resources
}
Collection < AnchorURL > pdflinksCombined = new HashSet < AnchorURL > ( ) ;
for ( Collection < AnchorURL > pdflinksx : pdflinks ) if ( pdflinksx ! = null ) pdflinksCombined . addAll ( pdflinksx ) ;
final Collection < AnchorURL > pdflinksCombined = new HashSet < > ( ) ;
for ( final Collection < AnchorURL > pdflinksx : pdflinks ) if ( pdflinksx ! = null ) pdflinksCombined . addAll ( pdflinksx ) ;
result = new Document [ ] { new Document (
location ,
mimeType ,
@ -258,25 +260,25 @@ public class pdfParser extends AbstractParser implements Parser {
* @return all detected links
* /
private List < Collection < AnchorURL > > extractPdfLinks ( final PDDocument pdf ) {
List < Collection < AnchorURL > > linkCollections = new ArrayList < > ( pdf . getNumberOfPages ( ) ) ;
for ( PDPage page : pdf . getPages ( ) ) {
final Collection < AnchorURL > pdflinks = new ArrayList < AnchorURL > ( ) ;
final List < Collection < AnchorURL > > linkCollections = new ArrayList < > ( pdf . getNumberOfPages ( ) ) ;
for ( final PDPage page : pdf . getPages ( ) ) {
final Collection < AnchorURL > pdflinks = new ArrayList < > ( ) ;
try {
List < PDAnnotation > annotations = page . getAnnotations ( ) ;
final List < PDAnnotation > annotations = page . getAnnotations ( ) ;
if ( annotations ! = null ) {
for ( PDAnnotation pdfannotation : annotations ) {
for ( final PDAnnotation pdfannotation : annotations ) {
if ( pdfannotation instanceof PDAnnotationLink ) {
PDAction link = ( ( PDAnnotationLink ) pdfannotation ) . getAction ( ) ;
final PDAction link = ( ( PDAnnotationLink ) pdfannotation ) . getAction ( ) ;
if ( link ! = null & & link instanceof PDActionURI ) {
PDActionURI pdflinkuri = ( PDActionURI ) link ;
String uristr = pdflinkuri . getURI ( ) ;
AnchorURL url = new AnchorURL ( uristr ) ;
final PDActionURI pdflinkuri = ( PDActionURI ) link ;
final String uristr = pdflinkuri . getURI ( ) ;
final AnchorURL url = new AnchorURL ( uristr ) ;
pdflinks . add ( url ) ;
}
}
}
}
} catch ( IOException ex ) { }
} catch ( final IOException ex ) { }
linkCollections . add ( pdflinks ) ;
}
return linkCollections ;
@ -345,7 +347,7 @@ public class pdfParser extends AbstractParser implements Parser {
if ( inStream ! = null ) {
try {
inStream . close ( ) ;
} catch ( IOException e ) {
} catch ( final IOException e ) {
System . err . println ( "Could not close input stream on file " + pdfFile ) ;
}
}
@ -359,7 +361,7 @@ public class pdfParser extends AbstractParser implements Parser {
System . out . println ( "\t!!!Parsing without result!!!" ) ;
} else {
System . out . println ( "\tParsed text with " + document . getTextLength ( ) + " chars of text and " + document . getAnchors ( ) . size ( ) + " anchors" ) ;
InputStream textStream = document . getTextStream ( ) ;
final InputStream textStream = document . getTextStream ( ) ;
try {
// write file
FileUtils . copy ( textStream , new File ( "parsedPdf.txt" ) ) ;
@ -372,7 +374,7 @@ public class pdfParser extends AbstractParser implements Parser {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream . close ( ) ;
}
} catch ( IOException e ) {
} catch ( final IOException e ) {
ConcurrentLog . warn ( "PDFPARSER" , "Could not close text input stream" ) ;
}
}