fix handling of encrypted PDF-Documents (with default user password "")

- update PDFBox package to current version 0.7.3
- use new security model in PDFBox to "guess" whether we can decrypt a document or not
NOTE: When upgrading to this version make sure the old PDFBox-0.7.2.jar is removed from libx/

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4161 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
fuchsi 18 years ago
parent b54fcd732b
commit e77aec8c9d

@ -16,7 +16,7 @@
<pathelement location="${build}" />
<!-- main lib needed to parse pdf files -->
<pathelement location="${libx}/PDFBox-0.7.2.jar" />
<pathelement location="${libx}/PDFBox-0.7.3.jar" />
</classpath>
</javac>
</target>
@ -25,7 +25,7 @@
<target name="zip" depends="compile">
<tar destfile="${parserArchive}" compression="gzip">
<tarfileset dir="${libx}"
includes="PDFBox-0.7.2.*"
includes="PDFBox-0.7.3.*"
prefix="${releaseFileParentDir}/libx/"
dirmode="755" mode="644"/>
<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
@ -39,7 +39,7 @@
<target name="copy" depends="compile">
<copy todir="${release}/libx/">
<fileset dir="${libx}" includes="PDFBox-0.7.2.*"/>
<fileset dir="${libx}" includes="PDFBox-0.7.3.*"/>
</copy>
<copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
<fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>

@ -53,6 +53,7 @@ import java.util.Hashtable;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.plasma.plasmaCrawlEURL;
@ -77,7 +78,7 @@ public class pdfParser extends AbstractParser implements Parser {
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"PDFBox-0.7.2.jar"
"PDFBox-0.7.3.jar"
};
public pdfParser() {
@ -119,7 +120,9 @@ public class pdfParser extends AbstractParser implements Parser {
theDocument = parser.getPDDocument();
if (theDocument.isEncrypted()) {
throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED);
theDocument.openProtection(new StandardDecryptionMaterial(""));
if (!theDocument.getCurrentAccessPermission().canExtractContent())
throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED);
}
// extracting some metadata

Loading…
Cancel
Save