From 24b0fa2a387e10a19cf5bc67258192e1f1350708 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Mon, 16 May 2016 02:13:33 +0200
Subject: [PATCH] extend snapshot Html2Image.pdf2image to use PDFBox image
 export capability if no external tool is installed (and on Windows). The
 resulting jpgs are not always perfect (if graphics are included) but imho sufficient.

---
 source/net/yacy/cora/util/Html2Image.java     | 53 +++++++++++++------
 .../net/yacy/document/parser/pdfParser.java   |  4 +-
 .../net/yacy/cora/util/Html2ImageTest.java    | 37 +++++++++++++
 3 files changed, 76 insertions(+), 18 deletions(-)
 create mode 100644 test/java/net/yacy/cora/util/Html2ImageTest.java

diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java
index b55ac845a..281f5c813 100644
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@@ -20,6 +20,18 @@
 
 package net.yacy.cora.util;
 
+import java.awt.Container;
+import java.awt.Dimension;
+import java.awt.Graphics;
+import java.awt.Image;
+import java.awt.MediaTracker;
+import java.awt.image.BufferedImage;
+import java.beans.PropertyChangeEvent;
+import java.beans.PropertyChangeListener;
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
 import javax.imageio.ImageIO;
 import javax.swing.JEditorPane;
 import javax.swing.text.Document;
@@ -34,18 +46,13 @@ import net.yacy.document.ImageParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.OS;
 
-import java.awt.Container;
-import java.awt.Dimension;
-import java.awt.Graphics;
-import java.awt.Image;
-import java.awt.MediaTracker;
-import java.awt.image.BufferedImage;
-import java.beans.PropertyChangeEvent;
-import java.beans.PropertyChangeListener;
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
 
+/**
+ * Convert html to a copy on disk (an image of the page) in another file format
+ * (currently pdf and/or jpg)
+ */
 public class Html2Image {
     
     // Mac
@@ -132,18 +139,32 @@ public class Html2Image {
     }
     
     /**
-     * convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
-     * @param pdf
-     * @param image
+     * convert a pdf (first page) to an image using the internal pdf library or an external command line tool on linux or mac.
+     * proper values are e.g. width = 1024, height = 1024, density = 300, quality = 75
+     * @param pdf input pdf file
+     * @param image output jpg file
      * @param width
      * @param height
-     * @param density
+     * @param density (dpi)
      * @param quality
      * @return
      */
     public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
         final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian;
-        
+
+        // convert pdf to jpg using internal pdfbox capability
+        if (OS.isWindows || !convert.exists()) {
+            try {
+                PDDocument pdoc = PDDocument.load(pdf);
+                PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0);
+                BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density);
+
+                return ImageIO.write(bi, "jpg", image);
+
+            } catch (IOException ex) { }
+        }
+
+        // convert on mac or linux using external command line utility
         try {
             // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
             // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 9291bdb25..bb8ef3a3b 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -41,6 +41,7 @@ import java.util.HashSet;
 import java.util.List;
 
 import org.apache.pdfbox.exceptions.CryptographyException;
+import org.apache.pdfbox.pdfparser.PDFParser;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.PDPage;
@@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;
-import org.apache.pdfbox.pdfparser.PDFParser;
 
 
 public class pdfParser extends AbstractParser implements Parser {
@@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser {
                             docPublisher,
                             null,
                             null,
-                            0.0f, 0.0f,
+                            0.0d, 0.0d,
                             pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                             pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                             null,
diff --git a/test/java/net/yacy/cora/util/Html2ImageTest.java b/test/java/net/yacy/cora/util/Html2ImageTest.java
new file mode 100644
index 000000000..dfb010b91
--- /dev/null
+++ b/test/java/net/yacy/cora/util/Html2ImageTest.java
@@ -0,0 +1,37 @@
+package net.yacy.cora.util;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.List;
+import net.yacy.utils.translation.ExtensionsFileFilter;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+
+public class Html2ImageTest {
+
+    /**
+     * Test of pdf2image method, of class Html2Image: every pdf in
+     * test/parsertest must yield a jpg file in test/DATA.
+     */
+    @Test
+    public void testPdf2image() {
+        // collect pdf filenames in test directory
+        File pd = new File("test/parsertest");
+        List<String> extensions = new ArrayList<>();
+        extensions.add("pdf");
+        FilenameFilter fileFilter = new ExtensionsFileFilter(extensions);
+        String[] pdffiles = pd.list(fileFilter);
+        assertNotNull("cannot list " + pd, pdffiles); // list() yields null for a missing dir
+        for (String pdffilename : pdffiles) {
+            File pdffile = new File(pd, pdffilename);
+            File jpgfile = new File("test/DATA", pdffilename + ".jpg");
+            // a stale jpg must be gone, else exists() below proves nothing
+            assertTrue("cannot delete " + jpgfile, !jpgfile.exists() || jpgfile.delete());
+            Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75);
+            assertTrue("no image for " + pdffile, jpgfile.exists());
+        }
+    }
+
+}