From 24b0fa2a387e10a19cf5bc67258192e1f1350708 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 16 May 2016 02:13:33 +0200 Subject: [PATCH] extend snapshot Html2Image.pdf2image to use PDFBox image export capability if no external tool installed (and for Win) Resulting jpg are not always perfect (if graphic included) but imho sufficient. --- source/net/yacy/cora/util/Html2Image.java | 53 +++++++++++++------ .../net/yacy/document/parser/pdfParser.java | 4 +- .../net/yacy/cora/util/Html2ImageTest.java | 37 +++++++++++++ 3 files changed, 76 insertions(+), 18 deletions(-) create mode 100644 test/java/net/yacy/cora/util/Html2ImageTest.java diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java index b55ac845a..281f5c813 100644 --- a/source/net/yacy/cora/util/Html2Image.java +++ b/source/net/yacy/cora/util/Html2Image.java @@ -20,6 +20,18 @@ package net.yacy.cora.util; +import java.awt.Container; +import java.awt.Dimension; +import java.awt.Graphics; +import java.awt.Image; +import java.awt.MediaTracker; +import java.awt.image.BufferedImage; +import java.beans.PropertyChangeEvent; +import java.beans.PropertyChangeListener; +import java.io.File; +import java.io.IOException; +import java.util.List; + import javax.imageio.ImageIO; import javax.swing.JEditorPane; import javax.swing.text.Document; @@ -34,18 +46,13 @@ import net.yacy.document.ImageParser; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.OS; -import java.awt.Container; -import java.awt.Dimension; -import java.awt.Graphics; -import java.awt.Image; -import java.awt.MediaTracker; -import java.awt.image.BufferedImage; -import java.beans.PropertyChangeEvent; -import java.beans.PropertyChangeListener; -import java.io.File; -import java.io.IOException; -import java.util.List; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +/** + * Convert html to a copy on disk-image in another file format + * currently (pdf and/or 
jpg) + */ public class Html2Image { // Mac @@ -132,18 +139,32 @@ public class Html2Image { } /** - * convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75 - * @param pdf - * @param image + * convert a pdf (first page) to an image. proper values are e.g. width = 1024, height = 1024, density = 300, quality = 75 + * using internal pdf library or external command line tool on linux or mac + * @param pdf input pdf file + * @param image output jpg file * @param width * @param height - * @param density + * @param density (dpi) * @param quality * @return */ public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) { final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian; - + + // convert pdf to jpg using internal pdfbox capability + if (OS.isWindows || !convert.exists()) { + try { + PDDocument pdoc = PDDocument.load(pdf); + PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0); + BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density); + + return ImageIO.write(bi, "jpg", image); + + } catch (IOException ex) { } + } + + // convert on mac or linux using external command line utility try { // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg // note: both -trim are necessary, otherwise it is trimmed only on one side. 
The [0] selects the first page of the pdf diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 9291bdb25..bb8ef3a3b 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -41,6 +41,7 @@ import java.util.HashSet; import java.util.List; import org.apache.pdfbox.exceptions.CryptographyException; +import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDPage; @@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; -import org.apache.pdfbox.pdfparser.PDFParser; public class pdfParser extends AbstractParser implements Parser { @@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser { docPublisher, null, null, - 0.0f, 0.0f, + 0.0d, 0.0d, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, diff --git a/test/java/net/yacy/cora/util/Html2ImageTest.java b/test/java/net/yacy/cora/util/Html2ImageTest.java new file mode 100644 index 000000000..dfb010b91 --- /dev/null +++ b/test/java/net/yacy/cora/util/Html2ImageTest.java @@ -0,0 +1,37 @@ +package net.yacy.cora.util; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.ArrayList; +import java.util.List; +import net.yacy.utils.translation.ExtensionsFileFilter; +import org.junit.Test; +import static org.junit.Assert.*; + + +public class Html2ImageTest { + + /** + * Test of pdf2image method, of class Html2Image. 
+ */ + @Test + public void testPdf2image() { + // collect pdf filenames in test directory + File pd = new File("test/parsertest"); + List extensions = new ArrayList(); + extensions.add("pdf"); + FilenameFilter fileFilter = new ExtensionsFileFilter(extensions); + String[] pdffiles = pd.list(fileFilter); + + for (String pdffilename : pdffiles) { + File pdffile = new File(pd, pdffilename); + File jpgfile = new File("test/DATA", pdffilename + ".jpg"); + if (jpgfile.exists()) { + jpgfile.delete(); + } + Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75); + assertTrue(jpgfile.exists()); + } + } + +}