extend snapshot Html2Image.pdf2image to use PDFBox image export capability

if no external tool installed (and for Win)
The resulting JPGs are not always perfect (if graphics are included) but IMHO sufficient.
pull/51/head
reger 9 years ago
parent eb2a00b1d8
commit 24b0fa2a38

@ -20,6 +20,18 @@
package net.yacy.cora.util;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.File;
import java.io.IOException;
import java.util.List;
import javax.imageio.ImageIO;
import javax.swing.JEditorPane;
import javax.swing.text.Document;
@ -34,18 +46,13 @@ import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
/**
* Convert html to an on-disk image copy in another file format
* (currently pdf and/or jpg)
*/
public class Html2Image {
// Mac
@ -132,18 +139,32 @@ public class Html2Image {
}
/**
* convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
* @param pdf
* @param image
* convert a pdf (first page) to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
* using internal pdf library or external command line tool on linux or mac
* @param pdf input pdf file
* @param image output jpg file
* @param width
* @param height
* @param density
* @param density (dpi)
* @param quality
* @return
*/
public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian;
// convert pdf to jpg using internal pdfbox capability
if (OS.isWindows || !convert.exists()) {
try {
PDDocument pdoc = PDDocument.load(pdf);
PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0);
BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density);
return ImageIO.write(bi, "jpg", image);
} catch (IOException ex) { }
}
// convert on mac or linux using external command line utility
try {
// i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
// note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf

@ -41,6 +41,7 @@ import java.util.HashSet;
import java.util.List;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import org.apache.pdfbox.pdfparser.PDFParser;
public class pdfParser extends AbstractParser implements Parser {
@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser {
docPublisher,
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
null,

@ -0,0 +1,37 @@
package net.yacy.cora.util;
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.List;
import net.yacy.utils.translation.ExtensionsFileFilter;
import org.junit.Test;
import static org.junit.Assert.*;
public class Html2ImageTest {

    /**
     * Test of pdf2image method, of class Html2Image.
     *
     * Every pdf file found in the parser test directory is converted to a jpg
     * and the test verifies that an output file has been produced for each one.
     */
    @Test
    public void testPdf2image() {
        // collect pdf filenames in test directory
        final File pd = new File("test/parsertest");
        final List<String> extensions = new ArrayList<String>();
        extensions.add("pdf");
        final FilenameFilter fileFilter = new ExtensionsFileFilter(extensions);
        final String[] pdffiles = pd.list(fileFilter);
        // File.list returns null if the path is not a directory or cannot be read;
        // fail with a clear message instead of a NullPointerException in the loop
        assertNotNull("cannot list pdf files in " + pd.getAbsolutePath(), pdffiles);

        // make sure the output directory is there, otherwise no image can be written
        final File outdir = new File("test/DATA");
        if (!outdir.isDirectory()) {
            outdir.mkdirs();
        }

        for (final String pdffilename : pdffiles) {
            final File pdffile = new File(pd, pdffilename);
            final File jpgfile = new File(outdir, pdffilename + ".jpg");
            if (jpgfile.exists()) {
                // a stale result of an earlier run would let the exists() check
                // below pass even if the conversion failed
                assertTrue("cannot remove old " + jpgfile.getPath(), jpgfile.delete());
            }
            Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75);
            assertTrue("no image created for " + pdffile.getPath(), jpgfile.exists());
        }
    }
}
Loading…
Cancel
Save