Added experimental PDF splitting, which enables YaCy to split PDFs during

parsing into individual pages and to add them all under different URLs. The constructed URLs are generated from the source URL by appending a page=<pagenumber> attribute to the URL's GET/POST properties; this distinguishes the individual page entries. The search result list then replaces that POST parameter with a URL anchor ('#' mark), so that the original URL is presented in the search result. These URLs can be opened directly on the correct page using pdf.js, which is now built into Firefox. That means: if you find a search hit on page 5 and click on the search result, Firefox will open the PDF viewer and show page 5.
10 years ago · 8c3e5b7b6d
parent 85773ebd4f
commit 8c3e5b7b6d
6 changed files with 144 additions and 69 deletions
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@ -57,7 +57,7 @@ public class SitemapImporter extends Thread {
    @Override
    public void run() {
        try {
-            logger.info("Start parsing sitemap file " + this.siteMapURL);
+            logger.info("Start parsing sitemap file " + this.siteMapURL.toNormalform(true));
            sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL, this.crawlingProfile.getAgent());
            parser.start();
            URLEntry item;
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -36,6 +36,7 @@ import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.List;

 import org.apache.pdfbox.exceptions.CryptographyException;
@ -51,6 +52,7 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.util.PDFTextStripper;

+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.ConcurrentLog;
@ -64,6 +66,9 @@ import net.yacy.kelondro.util.MemoryControl;

 public class pdfParser extends AbstractParser implements Parser {

+    public static boolean individualPages = false;
+    public static String individualPagePropertyname = "page";
+    
    public pdfParser() {
        super("Acrobat Portable Document Parser");
        this.SUPPORTED_EXTENSIONS.add("pdf");
@ -78,7 +83,7 @@ public class pdfParser extends AbstractParser implements Parser {
    static {
        clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise;
    }
-    
+
    @Override
    public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {

@ -141,51 +146,117 @@ public class pdfParser extends AbstractParser implements Parser {
        if (docTitle == null || docTitle.isEmpty()) {
            docTitle = MultiProtocolURL.unescape(location.getFileName());
        }
-        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
-        byte[] contentBytes = new byte[0];
-        Collection<AnchorURL> pdflinks = null;
+        if (docTitle == null) {
+            docTitle = docSubject;
+        }
+        String[] docKeywords = null;
+        if (docKeywordStr != null) {
+            docKeywords = docKeywordStr.split(" |,");
+        }
+        
+        Collection<AnchorURL>[] pdflinks = null;
+        Document[] result = null;
        try {
-            // create a writer for output
-            final PDFTextStripper  stripper = new PDFTextStripper("UTF-8");
+            // get the links
+            pdflinks = extractPdfLinks(pdfDoc);
+            
+            // get the fulltext (either per document or for each page)
+            final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
+
+            if (individualPages) {
+                // this is a hack which stores individual pages of the source pdf into individual index documents
+                // the new documents will get a virtual link with a post argument page=X appended to the original url
+                
+                // collect text
+                int pagecount = pdfDoc.getNumberOfPages();
+                String[] pages = new String[pagecount];
+                for (int page = 1; page <= pagecount; page++) {
+                    stripper.setStartPage(page);
+                    stripper.setEndPage(page);
+                    pages[page - 1] = stripper.getText(pdfDoc);
+                    System.out.println("PAGE " + page + ": " + pages[page - 1]);
+                }
+                
+                // create individual documents for each page
+                assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
+                result = new Document[Math.min(pages.length, pdflinks.length)];
+                String loc = location.toNormalform(true);
+                for (int page = 0; page < result.length; page++) {                    
+                    result[page] = new Document(
+                            new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
+                            mimeType,
+                            "UTF-8",
+                            this,
+                            null,
+                            docKeywords,
+                            singleList(docTitle),
+                            docAuthor,
+                            docPublisher,
+                            null,
+                            null,
+                            0.0f, 0.0f,
+                            pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
+                            pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
+                            null,
+                            null,
+                            false,
+                            docDate);
+                }
+            } else {
+                // collect the whole text at once
+                final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
+                byte[] contentBytes = new byte[0];
+                stripper.setEndPage(3); // get first 3 pages (always)
+                writer.append(stripper.getText(pdfDoc));
+                contentBytes = writer.getBytes(); // remember text in case of interrupting thread

-            stripper.setEndPage(3); // get first 3 pages (always)
-            writer.append(stripper.getText(pdfDoc));
-            contentBytes = writer.getBytes(); // remember text in case of interrupting thread
+                if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
+                    stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
+                    stripper.setEndPage(Integer.MAX_VALUE); // set to default
+                    // we start the pdf parsing in a separate thread to ensure that it can be terminated
+                    final PDDocument pdfDocC = pdfDoc;
+                    final Thread t = new Thread() {
+                        @Override
+                        public void run() {
+                            Thread.currentThread().setName("pdfParser.getText:" + location);
+                            try {
+                                writer.append(stripper.getText(pdfDocC));
+                            } catch (final Throwable e) {}
+                        }
+                    };
+                    t.start();
+                    t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
+                    if (t.isAlive()) t.interrupt();
+                }
+                contentBytes = writer.getBytes(); // get final text before closing writer

-            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
-                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
-                stripper.setEndPage(Integer.MAX_VALUE); // set to default
-                // we start the pdf parsing in a separate thread to ensure that it can be terminated
-                final PDDocument pdfDocC = pdfDoc;
-                final Thread t = new Thread() {
-                    @Override
-                    public void run() {
-                        Thread.currentThread().setName("pdfParser.getText:" + location);
-                        try {
-                            writer.append(stripper.getText(pdfDocC));
-                        } catch (final Throwable e) {}
-                    }
-                };
-                t.start();
-                t.join(3000);
-                if (t.isAlive()) t.interrupt();
-            }
-            contentBytes = writer.getBytes(); // get final text before closing writer
-            pdflinks = extractPdfLinks(pdfDoc);
+                Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
+                for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+                result = new Document[]{new Document(
+                        location,
+                        mimeType,
+                        "UTF-8",
+                        this,
+                        null,
+                        docKeywords,
+                        singleList(docTitle),
+                        docAuthor,
+                        docPublisher,
+                        null,
+                        null,
+                        0.0f, 0.0f,
+                        contentBytes,
+                        pdflinksCombined,
+                        null,
+                        null,
+                        false,
+                        docDate)};
+            }         
        } catch (final Throwable e) {
            //close the writer (in finally)
            //throw new Parser.Failure(e.getMessage(), location);
        } finally {
            try {pdfDoc.close();} catch (final Throwable e) {}
-            writer.close();
-        }
-
-        String[] docKeywords = null;
-        if (docKeywordStr != null) {
-            docKeywords = docKeywordStr.split(" |,");
-        }
-        if (docTitle == null) {
-            docTitle = docSubject;
        }

        // clear resources in pdfbox. they say that is resolved but it's not. see:
@ -201,25 +272,7 @@ public class pdfParser extends AbstractParser implements Parser {
        pdfDoc = null;
        clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
        
-        return new Document[]{new Document(
-                location,
-                mimeType,
-                "UTF-8",
-                this,
-                null,
-                docKeywords,
-                singleList(docTitle),
-                docAuthor,
-                docPublisher,
-                null,
-                null,
-                0.0f, 0.0f,
-                contentBytes,
-                (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
-                null,
-                null,
-                false,
-                docDate)};
+        return result;
    }

    /**
@ -227,11 +280,14 @@ public class pdfParser extends AbstractParser implements Parser {
     * @param pdf the document to parse
     * @return all detected links
     */
-    private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
-        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+    private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
        @SuppressWarnings("unchecked")
        List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
+        @SuppressWarnings("unchecked")
+        Collection<AnchorURL>[] linkCollections = (Collection<AnchorURL>[]) new Collection<?>[allPages.size()];
+        int pagecount = 0;
        for (PDPage page : allPages) {
+            final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
            try {
                List<PDAnnotation> annotations = page.getAnnotations();
                if (annotations != null) {
@ -248,8 +304,9 @@ public class pdfParser extends AbstractParser implements Parser {
                    }
                }
            } catch (IOException ex) {}
+            linkCollections[pagecount++] = pdflinks;
        }
-        return pdflinks;
+        return linkCollections;
    }

    public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@ -392,14 +392,14 @@ public final class Protocol {
            parts.put("object", UTF8.StringBody("rwicount"));
            parts.put("ttl", UTF8.StringBody("0"));
            parts.put("env", UTF8.StringBody(""));
-            ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "posting request to " + targetAddress);
+            //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "posting request to " + targetAddress);
            final Post post = new Post(targetAddress, targetHash, "/yacy/query.html", parts, timeout);
-            ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received CONTENT from requesting " + targetAddress + (post.result == null ? "NULL" : (": length = " + post.result.length)));
+            //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received CONTENT from requesting " + targetAddress + (post.result == null ? "NULL" : (": length = " + post.result.length)));
            final Map<String, String> result = FileUtils.table(post.result);
            if (result == null || result.isEmpty()) return new long[] {-1, -1};
-            ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString());
+            //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString());
            final String resp = result.get("response");
-            ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp);
+            //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp);
            if (resp == null) return new long[] {-1, -1};
            String magic = result.get("magic");
            if (magic == null) magic = "0";
@ -409,7 +409,7 @@ public final class Protocol {
                return new long[] {-1, -1};
            }
        } catch (final Exception e ) {
-            ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage());
+            //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage());
            if (Network.log.isFine()) Network.log.fine("yacyClient.queryRWICount error:" + e.getMessage());
            return new long[] {-1, -1};
        }
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -833,6 +833,8 @@ public final class Switchboard extends serverSwitch {
                
    	TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
        TextParser.setDenyExtension(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
+        pdfParser.individualPages = getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
+        pdfParser.individualPagePropertyname = getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");

        // start a loader
        this.log.config("Starting Crawl Loader");
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@ -284,6 +284,8 @@ public final class SwitchboardConstants {
    public static final String INDEX_TRANSFER_GZIP_BODY         = "indexTransfer.gzipBody";
    public static final String PARSER_MIME_DENY                 = "parser.mime.deny";
    public static final String PARSER_EXTENSIONS_DENY           = "parser.extensions.deny";
+    public static final String PARSER_PDF_INDIVIDUALPAGES       = "parser.pdf.individualpages";
+    public static final String PARSER_PDF_INDIVIDUALPAGES_KEY   = "parser.pdf.individualpages.key";
    /**
     * <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
     * <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>
--- a/source/net/yacy/search/snippet/ResultEntry.java
+++ b/source/net/yacy/search/snippet/ResultEntry.java
@ -27,6 +27,7 @@
 package net.yacy.search.snippet;

 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.Comparator;
 import java.util.Date;

@ -36,6 +37,7 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.util.ByteArray;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.Condenser;
+import net.yacy.document.parser.pdfParser;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
@ -126,16 +128,28 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
        return this.urlentry.flags();
    }
    public String urlstring() {
-        return (this.alternative_urlstring == null) ? this.urlentry.url().toNormalform(true) : this.alternative_urlstring;
+        if (this.alternative_urlstring != null) return this.alternative_urlstring;
+        
+        if (!pdfParser.individualPages) return this.url().toNormalform(true);
+        if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.urlentry.url().getFileName()).toLowerCase())) return this.url().toNormalform(true);
+        // for pdf links we rewrite the url
+        // this is a special treatment of pdf files which can be splitted into subpages
+        String pageprop = pdfParser.individualPagePropertyname;
+        String resultUrlstring = this.urlentry.url().toNormalform(true);
+        int p = resultUrlstring.lastIndexOf(pageprop + "=");
+        if (p > 0) {
+          return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
+        }
+        return resultUrlstring;
    }
    public String urlname() {
-        return (this.alternative_urlname == null) ? MultiProtocolURL.unescape(this.urlentry.url().toNormalform(true)) : this.alternative_urlname;
+        return (this.alternative_urlname == null) ? MultiProtocolURL.unescape(urlstring()) : this.alternative_urlname;
    }
    public String title() {
        String titlestr = this.urlentry.dc_title();
        // if title is empty use filename as title
        if (titlestr.isEmpty()) { // if url has no filename, title is still empty (e.g. "www.host.com/" )
-            titlestr = this.urlentry.url() != null ? this.urlentry.url().getFileName() : "";
+            titlestr = this.url() != null ? this.url().getFileName() : "";
        }
        return titlestr;
    }