From 8c3e5b7b6d091dda8e512cde50127c54070df5f3 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 21 Dec 2014 18:10:15 +0100
Subject: [PATCH] added experimental pdf splitting which enables YaCy to split
 pdfs during parsing into individual pages and add them all using different
 URLs

These constructed urls are generated from the source url with a page=
attribute appended to the url get/post properties; this distinguishes the
individual page entries. The search result list then replaces the post
parameter with a url anchor # mark, which causes the original url to be
presented in the search result. These URLs can be opened directly on the
correct page using pdf.js, which is now built into firefox. That means: if
you find a search hit on page 5 and click on the search result, firefox
will open the pdf viewer and show page 5.
---
 .../crawler/retrieval/SitemapImporter.java    |   2 +-
 .../net/yacy/document/parser/pdfParser.java   | 177 ++++++++++++------
 source/net/yacy/peers/Protocol.java           |  10 +-
 source/net/yacy/search/Switchboard.java       |   2 +
 .../net/yacy/search/SwitchboardConstants.java |   2 +
 .../net/yacy/search/snippet/ResultEntry.java  |  20 +-
 6 files changed, 144 insertions(+), 69 deletions(-)

diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java
index 936123360..240f8239d 100644
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@@ -57,7 +57,7 @@ public class SitemapImporter extends Thread {
     @Override
     public void run() {
         try {
-            logger.info("Start parsing sitemap file " + this.siteMapURL);
+            logger.info("Start parsing sitemap file " + this.siteMapURL.toNormalform(true));
             sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL, this.crawlingProfile.getAgent());
             parser.start();
             URLEntry item;
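Note on the url scheme: the pdfParser.java diff below derives one virtual url per
page by appending an ordinary get/post property rather than a '#' fragment,
because the fragment would be stripped when the urlhash is computed. A minimal
sketch of that derivation, using plain strings instead of AnchorURL; the method
name, the property name "page" and the example url are illustrative only:

    // sketch: derive the virtual url for one pdf page (1-based page numbers);
    // append with '&' if the source url already carries a query string, else with '?'
    public static String virtualPageUrl(final String loc, final String propertyName, final int page) {
        return loc + (loc.indexOf('?') > 0 ? '&' : '?') + propertyName + '=' + page;
    }
    // virtualPageUrl("http://example.org/doc.pdf", "page", 5) -> "http://example.org/doc.pdf?page=5"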
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 7b5daaba7..24af35411 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -36,6 +36,7 @@ import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.List;
 
 import org.apache.pdfbox.exceptions.CryptographyException;
@@ -51,6 +52,7 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.util.PDFTextStripper;
 
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.ConcurrentLog;
@@ -64,6 +66,9 @@ import net.yacy.kelondro.util.MemoryControl;
 
 public class pdfParser extends AbstractParser implements Parser {
 
+    public static boolean individualPages = false;
+    public static String individualPagePropertyname = "page";
+
     public pdfParser() {
         super("Acrobat Portable Document Parser");
         this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -78,7 +83,7 @@ public class pdfParser extends AbstractParser implements Parser {
     static {
         clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise;
     }
-
+
     @Override
     public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
@@ -141,51 +146,117 @@ public class pdfParser extends AbstractParser implements Parser {
         if (docTitle == null || docTitle.isEmpty()) {
             docTitle = MultiProtocolURL.unescape(location.getFileName());
         }
-        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
-        byte[] contentBytes = new byte[0];
-        Collection<AnchorURL> pdflinks = null;
+        if (docTitle == null) {
+            docTitle = docSubject;
+        }
+        String[] docKeywords = null;
+        if (docKeywordStr != null) {
+            docKeywords = docKeywordStr.split(" |,");
+        }
+
+        Collection<AnchorURL>[] pdflinks = null;
+        Document[] result = null;
         try {
-            // create a writer for output
-            final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
+            // get the links
+            pdflinks = extractPdfLinks(pdfDoc);
+
+            // get the fulltext (either per document or for each page)
+            final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
+
+            if (individualPages) {
+                // this is a hack which stores individual pages of the source pdf into individual index documents
+                // the new documents will get a virtual link with a post argument page=X appended to the original url
+
+                // collect text
+                int pagecount = pdfDoc.getNumberOfPages();
+                String[] pages = new String[pagecount];
+                for (int page = 1; page <= pagecount; page++) {
+                    stripper.setStartPage(page);
+                    stripper.setEndPage(page);
+                    pages[page - 1] = stripper.getText(pdfDoc);
+                    System.out.println("PAGE " + page + ": " + pages[page - 1]);
+                }
+
+                // create individual documents for each page
+                assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
+                result = new Document[Math.min(pages.length, pdflinks.length)];
+                String loc = location.toNormalform(true);
+                for (int page = 0; page < result.length; page++) {
+                    result[page] = new Document(
+                            new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
+                            mimeType,
+                            "UTF-8",
+                            this,
+                            null,
+                            docKeywords,
+                            singleList(docTitle),
+                            docAuthor,
+                            docPublisher,
+                            null,
+                            null,
+                            0.0f, 0.0f,
+                            pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
+                            pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
+                            null,
+                            null,
+                            false,
+                            docDate);
+                }
+            } else {
+                // collect the whole text at once
+                final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
+                byte[] contentBytes = new byte[0];
+                stripper.setEndPage(3); // get first 3 pages (always)
+                writer.append(stripper.getText(pdfDoc));
+                contentBytes = writer.getBytes(); // remember text in case of interrupting thread
-            stripper.setEndPage(3); // get first 3 pages (always)
-            writer.append(stripper.getText(pdfDoc));
-            contentBytes = writer.getBytes(); // remember text in case of interrupting thread
+                if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
+                    stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
+                    stripper.setEndPage(Integer.MAX_VALUE); // set to default
+                    // we start the pdf parsing in a separate thread to ensure that it can be terminated
+                    final PDDocument pdfDocC = pdfDoc;
+                    final Thread t = new Thread() {
+                        @Override
+                        public void run() {
+                            Thread.currentThread().setName("pdfParser.getText:" + location);
+                            try {
+                                writer.append(stripper.getText(pdfDocC));
+                            } catch (final Throwable e) {}
+                        }
+                    };
+                    t.start();
+                    t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
+                    if (t.isAlive()) t.interrupt();
+                }
+                contentBytes = writer.getBytes(); // get final text before closing writer
-            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
-                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
-                stripper.setEndPage(Integer.MAX_VALUE); // set to default
-                // we start the pdf parsing in a separate thread to ensure that it can be terminated
-                final PDDocument pdfDocC = pdfDoc;
-                final Thread t = new Thread() {
-                    @Override
-                    public void run() {
-                        Thread.currentThread().setName("pdfParser.getText:" + location);
-                        try {
-                            writer.append(stripper.getText(pdfDocC));
-                        } catch (final Throwable e) {}
-                    }
-                };
-                t.start();
-                t.join(3000);
-                if (t.isAlive()) t.interrupt();
-            }
-            contentBytes = writer.getBytes(); // get final text before closing writer
-            pdflinks = extractPdfLinks(pdfDoc);
+                Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
+                for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+                result = new Document[]{new Document(
+                        location,
+                        mimeType,
+                        "UTF-8",
+                        this,
+                        null,
+                        docKeywords,
+                        singleList(docTitle),
+                        docAuthor,
+                        docPublisher,
+                        null,
+                        null,
+                        0.0f, 0.0f,
+                        contentBytes,
+                        pdflinksCombined,
+                        null,
+                        null,
+                        false,
+                        docDate)};
+            }
         } catch (final Throwable e) {
             //close the writer (in finally)
             //throw new Parser.Failure(e.getMessage(), location);
         } finally {
             try {pdfDoc.close();} catch (final Throwable e) {}
-            writer.close();
-        }
-
-        String[] docKeywords = null;
-        if (docKeywordStr != null) {
-            docKeywords = docKeywordStr.split(" |,");
-        }
-        if (docTitle == null) {
-            docTitle = docSubject;
         }
 
         // clear resources in pdfbox. they say that is resolved but it's not. see:
@@ -201,25 +272,7 @@ public class pdfParser extends AbstractParser implements Parser {
         pdfDoc = null;
         clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
 
-        return new Document[]{new Document(
-                location,
-                mimeType,
-                "UTF-8",
-                this,
-                null,
-                docKeywords,
-                singleList(docTitle),
-                docAuthor,
-                docPublisher,
-                null,
-                null,
-                0.0f, 0.0f,
-                contentBytes,
-                (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
-                null,
-                null,
-                false,
-                docDate)};
+        return result;
     }
 
     /**
@@ -227,11 +280,14 @@ public class pdfParser extends AbstractParser implements Parser {
      * @param pdf the document to parse
      * @return all detected links
      */
-    private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
-        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+    private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
         @SuppressWarnings("unchecked")
         List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
+        @SuppressWarnings("unchecked")
+        Collection<AnchorURL>[] linkCollections = (Collection<AnchorURL>[]) new Collection[allPages.size()];
+        int pagecount = 0;
         for (PDPage page : allPages) {
+            final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
             try {
                 List<PDAnnotation> annotations = page.getAnnotations();
                 if (annotations != null) {
@@ -248,8 +304,9 @@ public class pdfParser extends AbstractParser implements Parser {
                     }
                 }
             } catch (IOException ex) {}
+            linkCollections[pagecount++] = pdflinks;
         }
-        return pdflinks;
+        return linkCollections;
     }
 
     public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
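The individualPages branch above drives PDFTextStripper once per page by
narrowing setStartPage/setEndPage to a single page. A self-contained sketch of
the same technique against the PDFBox 1.x API used in this patch; the file name
is illustrative and error handling is reduced to a bare throws clause:

    import java.io.File;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.util.PDFTextStripper;

    public class PerPageText {
        public static void main(final String[] args) throws Exception {
            final PDDocument doc = PDDocument.load(new File("sample.pdf")); // illustrative input
            try {
                final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
                final int pagecount = doc.getNumberOfPages();
                final String[] pages = new String[pagecount];
                for (int page = 1; page <= pagecount; page++) { // stripper page numbers are 1-based
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    pages[page - 1] = stripper.getText(doc);
                }
                System.out.println("extracted " + pages.length + " pages");
            } finally {
                doc.close();
            }
        }
    }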
"NULL" : (": length = " + post.result.length))); final Map result = FileUtils.table(post.result); if (result == null || result.isEmpty()) return new long[] {-1, -1}; - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString()); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString()); final String resp = result.get("response"); - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp); if (resp == null) return new long[] {-1, -1}; String magic = result.get("magic"); if (magic == null) magic = "0"; @@ -409,7 +409,7 @@ public final class Protocol { return new long[] {-1, -1}; } } catch (final Exception e ) { - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage()); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage()); if (Network.log.isFine()) Network.log.fine("yacyClient.queryRWICount error:" + e.getMessage()); return new long[] {-1, -1}; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d6c8cd336..a9278b497 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -833,6 +833,8 @@ public final class Switchboard extends serverSwitch { TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); TextParser.setDenyExtension(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); + pdfParser.individualPages = getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); + pdfParser.individualPagePropertyname = getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); // start a loader this.log.config("Starting Crawl Loader"); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 88c4873fa..c90eeec96 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -284,6 +284,8 @@ public final class SwitchboardConstants { public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody"; public static final String PARSER_MIME_DENY = "parser.mime.deny"; public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny"; + public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages"; + public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key"; /** *

public static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds

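The two constants above are read once at startup in the Switchboard.java hunk,
so the feature stays off unless the corresponding settings are present. The
patch does not touch a defaults file; enabling the splitter in a YaCy config
would presumably look like this, with the default values taken from the
getConfigBool/getConfig calls above (the entries themselves are an assumption,
not part of this commit):

    # experimental: index each page of a pdf as its own document (default: off)
    parser.pdf.individualpages=true
    # name of the get/post property appended to the source url for each page
    parser.pdf.individualpages.key=page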
diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java
index e1f361ffb..1d24be921 100644
--- a/source/net/yacy/search/snippet/ResultEntry.java
+++ b/source/net/yacy/search/snippet/ResultEntry.java
@@ -27,6 +27,7 @@
 package net.yacy.search.snippet;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.Comparator;
 import java.util.Date;
 
@@ -36,6 +37,7 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.util.ByteArray;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.Condenser;
+import net.yacy.document.parser.pdfParser;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
@@ -126,16 +128,28 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry>
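The body of the last ResultEntry hunk is truncated here. According to the
commit message, its purpose is to turn the virtual page property back into a
url '#' anchor for display, so the original url is shown and a click on the hit
opens pdf.js (built into firefox) on the matching page. A hedged sketch of such
a rewrite; the method name and string handling are assumptions for
illustration, not the missing hunk itself:

    // sketch: replace the virtual "page=N" get/post property with a "#page=N"
    // fragment; assumes the page property is the last parameter of the url
    private static String toDisplayUrl(final String urlstring, final String key) {
        int p = urlstring.indexOf("?" + key + "=");
        if (p < 0) p = urlstring.indexOf("&" + key + "=");
        if (p < 0) return urlstring; // not a split pdf url, leave unchanged
        final String pagenumber = urlstring.substring(p + key.length() + 2);
        return urlstring.substring(0, p) + "#" + key + "=" + pagenumber;
    }
    // toDisplayUrl("http://example.org/doc.pdf?page=5", "page") -> "http://example.org/doc.pdf#page=5"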