From 8c3e5b7b6d091dda8e512cde50127c54070df5f3 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 21 Dec 2014 18:10:15 +0100
Subject: [PATCH] added experimental pdf splitting which enables YaCy to split
 pdfs during parsing into individual pages and add them all using different
 URLs

These constructed urls are generated from the source url with a page=
attribute appended to the url get/post properties; this distinguishes the
individual page entries. The search result list then replaces the post
parameter with a url anchor # mark, which causes the original url to be
presented in the search result. These URLs can be opened directly on the
correct page using pdf.js, which is now built into firefox. That means: if
you find a search hit on page 5 and click on the search result, firefox
will open the pdf viewer and show page 5.
---
 .../crawler/retrieval/SitemapImporter.java    |   2 +-
 .../net/yacy/document/parser/pdfParser.java   | 177 ++++++++++++------
 source/net/yacy/peers/Protocol.java           |  10 +-
 source/net/yacy/search/Switchboard.java       |   2 +
 .../net/yacy/search/SwitchboardConstants.java |   2 +
 .../net/yacy/search/snippet/ResultEntry.java  |  20 +-
 6 files changed, 144 insertions(+), 69 deletions(-)

diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java
index 936123360..240f8239d 100644
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@@ -57,7 +57,7 @@ public class SitemapImporter extends Thread {
     @Override
     public void run() {
         try {
-            logger.info("Start parsing sitemap file " + this.siteMapURL);
+            logger.info("Start parsing sitemap file " + this.siteMapURL.toNormalform(true));
             sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL, this.crawlingProfile.getAgent());
             parser.start();
             URLEntry item;
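Note on the url scheme: the pdfParser.java diff below derives one virtual url per
page by appending an ordinary get/post property rather than a '#' fragment,
because the fragment would be stripped when the urlhash is computed. A minimal
sketch of that derivation, using plain strings instead of AnchorURL; the method
name, the property name "page" and the example url are illustrative only:

    // sketch: derive the virtual url for one pdf page (1-based page numbers);
    // append with '&' if the source url already carries a query string, else with '?'
    public static String virtualPageUrl(final String loc, final String propertyName, final int page) {
        return loc + (loc.indexOf('?') > 0 ? '&' : '?') + propertyName + '=' + page;
    }
    // virtualPageUrl("http://example.org/doc.pdf", "page", 5) -> "http://example.org/doc.pdf?page=5"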
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 7b5daaba7..24af35411 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -36,6 +36,7 @@ import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.List;
 
 import org.apache.pdfbox.exceptions.CryptographyException;
@@ -51,6 +52,7 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.util.PDFTextStripper;
 
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.ConcurrentLog;
@@ -64,6 +66,9 @@ import net.yacy.kelondro.util.MemoryControl;
 
 public class pdfParser extends AbstractParser implements Parser {
 
+    public static boolean individualPages = false;
+    public static String individualPagePropertyname = "page";
+
     public pdfParser() {
         super("Acrobat Portable Document Parser");
         this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -78,7 +83,7 @@ public class pdfParser extends AbstractParser implements Parser {
     static {
         clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise;
     }
-
+
     @Override
     public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
@@ -141,51 +146,117 @@ public class pdfParser extends AbstractParser implements Parser {
         if (docTitle == null || docTitle.isEmpty()) {
             docTitle = MultiProtocolURL.unescape(location.getFileName());
         }
-        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
-        byte[] contentBytes = new byte[0];
-        Collection<AnchorURL> pdflinks = null;
+        if (docTitle == null) {
+            docTitle = docSubject;
+        }
+        String[] docKeywords = null;
+        if (docKeywordStr != null) {
+            docKeywords = docKeywordStr.split(" |,");
+        }
+
+        Collection<AnchorURL>[] pdflinks = null;
+        Document[] result = null;
         try {
-            // create a writer for output
-            final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
+            // get the links
+            pdflinks = extractPdfLinks(pdfDoc);
+
+            // get the fulltext (either per document or for each page)
+            final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
+
+            if (individualPages) {
+                // this is a hack which stores individual pages of the source pdf into individual index documents
+                // the new documents will get a virtual link with a post argument page=X appended to the original url
+
+                // collect text
+                int pagecount = pdfDoc.getNumberOfPages();
+                String[] pages = new String[pagecount];
+                for (int page = 1; page <= pagecount; page++) {
+                    stripper.setStartPage(page);
+                    stripper.setEndPage(page);
+                    pages[page - 1] = stripper.getText(pdfDoc);
+                    System.out.println("PAGE " + page + ": " + pages[page - 1]);
+                }
+
+                // create individual documents for each page
+                assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
+                result = new Document[Math.min(pages.length, pdflinks.length)];
+                String loc = location.toNormalform(true);
+                for (int page = 0; page < result.length; page++) {
+                    result[page] = new Document(
+                            new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
+                            mimeType,
+                            "UTF-8",
+                            this,
+                            null,
+                            docKeywords,
+                            singleList(docTitle),
+                            docAuthor,
+                            docPublisher,
+                            null,
+                            null,
+                            0.0f, 0.0f,
+                            pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
+                            pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
+                            null,
+                            null,
+                            false,
+                            docDate);
+                }
+            } else {
+                // collect the whole text at once
+                final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
+                byte[] contentBytes = new byte[0];
+                stripper.setEndPage(3); // get first 3 pages (always)
+                writer.append(stripper.getText(pdfDoc));
+                contentBytes = writer.getBytes(); // remember text in case of interrupting thread
-            stripper.setEndPage(3); // get first 3 pages (always)
-            writer.append(stripper.getText(pdfDoc));
-            contentBytes = writer.getBytes(); // remember text in case of interrupting thread
+                if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
+                    stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
+                    stripper.setEndPage(Integer.MAX_VALUE); // set to default
+                    // we start the pdf parsing in a separate thread to ensure that it can be terminated
+                    final PDDocument pdfDocC = pdfDoc;
+                    final Thread t = new Thread() {
+                        @Override
+                        public void run() {
+                            Thread.currentThread().setName("pdfParser.getText:" + location);
+                            try {
+                                writer.append(stripper.getText(pdfDocC));
+                            } catch (final Throwable e) {}
+                        }
+                    };
+                    t.start();
+                    t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
+                    if (t.isAlive()) t.interrupt();
+                }
+                contentBytes = writer.getBytes(); // get final text before closing writer
-            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
-                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
-                stripper.setEndPage(Integer.MAX_VALUE); // set to default
-                // we start the pdf parsing in a separate thread to ensure that it can be terminated
-                final PDDocument pdfDocC = pdfDoc;
-                final Thread t = new Thread() {
-                    @Override
-                    public void run() {
-                        Thread.currentThread().setName("pdfParser.getText:" + location);
-                        try {
-                            writer.append(stripper.getText(pdfDocC));
-                        } catch (final Throwable e) {}
-                    }
-                };
-                t.start();
-                t.join(3000);
-                if (t.isAlive()) t.interrupt();
-            }
-            contentBytes = writer.getBytes(); // get final text before closing writer
-            pdflinks = extractPdfLinks(pdfDoc);
+                Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
+                for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+                result = new Document[]{new Document(
+                        location,
+                        mimeType,
+                        "UTF-8",
+                        this,
+                        null,
+                        docKeywords,
+                        singleList(docTitle),
+                        docAuthor,
+                        docPublisher,
+                        null,
+                        null,
+                        0.0f, 0.0f,
+                        contentBytes,
+                        pdflinksCombined,
+                        null,
+                        null,
+                        false,
+                        docDate)};
+            }
         } catch (final Throwable e) {
             //close the writer (in finally)
             //throw new Parser.Failure(e.getMessage(), location);
         } finally {
             try {pdfDoc.close();} catch (final Throwable e) {}
-            writer.close();
-        }
-
-        String[] docKeywords = null;
-        if (docKeywordStr != null) {
-            docKeywords = docKeywordStr.split(" |,");
-        }
-        if (docTitle == null) {
-            docTitle = docSubject;
         }
 
         // clear resources in pdfbox. they say that is resolved but it's not. see:
@@ -201,25 +272,7 @@ public class pdfParser extends AbstractParser implements Parser {
         pdfDoc = null;
         clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
 
-        return new Document[]{new Document(
-                location,
-                mimeType,
-                "UTF-8",
-                this,
-                null,
-                docKeywords,
-                singleList(docTitle),
-                docAuthor,
-                docPublisher,
-                null,
-                null,
-                0.0f, 0.0f,
-                contentBytes,
-                (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
-                null,
-                null,
-                false,
-                docDate)};
+        return result;
     }
 
     /**
@@ -227,11 +280,14 @@ public class pdfParser extends AbstractParser implements Parser {
      * @param pdf the document to parse
      * @return all detected links
      */
-    private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
-        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+    private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
         @SuppressWarnings("unchecked")
         List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
+        @SuppressWarnings("unchecked")
+        Collection<AnchorURL>[] linkCollections = (Collection<AnchorURL>[]) new Collection[allPages.size()];
+        int pagecount = 0;
         for (PDPage page : allPages) {
+            final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
             try {
                 List<PDAnnotation> annotations = page.getAnnotations();
                 if (annotations != null) {
@@ -248,8 +304,9 @@ public class pdfParser extends AbstractParser implements Parser {
                     }
                 }
             } catch (IOException ex) {}
+            linkCollections[pagecount++] = pdflinks;
         }
-        return pdflinks;
+        return linkCollections;
     }
 
     public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
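The individualPages branch above drives PDFTextStripper once per page by
narrowing setStartPage/setEndPage to a single page. A self-contained sketch of
the same technique against the PDFBox 1.x API used in this patch; the file name
is illustrative and error handling is reduced to a bare throws clause:

    import java.io.File;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.util.PDFTextStripper;

    public class PerPageText {
        public static void main(final String[] args) throws Exception {
            final PDDocument doc = PDDocument.load(new File("sample.pdf")); // illustrative input
            try {
                final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
                final int pagecount = doc.getNumberOfPages();
                final String[] pages = new String[pagecount];
                for (int page = 1; page <= pagecount; page++) { // stripper page numbers are 1-based
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    pages[page - 1] = stripper.getText(doc);
                }
                System.out.println("extracted " + pages.length + " pages");
            } finally {
                doc.close();
            }
        }
    }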
"NULL" : (": length = " + post.result.length))); final Map result = FileUtils.table(post.result); if (result == null || result.isEmpty()) return new long[] {-1, -1}; - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString()); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString()); final String resp = result.get("response"); - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp); if (resp == null) return new long[] {-1, -1}; String magic = result.get("magic"); if (magic == null) magic = "0"; @@ -409,7 +409,7 @@ public final class Protocol { return new long[] {-1, -1}; } } catch (final Exception e ) { - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage()); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage()); if (Network.log.isFine()) Network.log.fine("yacyClient.queryRWICount error:" + e.getMessage()); return new long[] {-1, -1}; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d6c8cd336..a9278b497 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -833,6 +833,8 @@ public final class Switchboard extends serverSwitch { TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); TextParser.setDenyExtension(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); + pdfParser.individualPages = getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); + pdfParser.individualPagePropertyname = getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); // start a loader this.log.config("Starting Crawl Loader"); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 88c4873fa..c90eeec96 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -284,6 +284,8 @@ public final class SwitchboardConstants { public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody"; public static final String PARSER_MIME_DENY = "parser.mime.deny"; public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny"; + public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages"; + public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key"; /** *

public static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds

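The two constants above are read once at startup in the Switchboard.java hunk,
so the feature stays off unless the corresponding settings are present. The
patch does not touch a defaults file; enabling the splitter in a YaCy config
would presumably look like this, with the default values taken from the
getConfigBool/getConfig calls above (the entries themselves are an assumption,
not part of this commit):

    # experimental: index each page of a pdf as its own document (default: off)
    parser.pdf.individualpages=true
    # name of the get/post property appended to the source url for each page
    parser.pdf.individualpages.key=page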
diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java
index e1f361ffb..1d24be921 100644
--- a/source/net/yacy/search/snippet/ResultEntry.java
+++ b/source/net/yacy/search/snippet/ResultEntry.java
@@ -27,6 +27,7 @@
 package net.yacy.search.snippet;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.Comparator;
 import java.util.Date;
 
@@ -36,6 +37,7 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.util.ByteArray;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.Condenser;
+import net.yacy.document.parser.pdfParser;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
@@ -126,16 +128,28 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry>
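The body of the last ResultEntry hunk is truncated here. According to the
commit message, its purpose is to turn the virtual page property back into a
url '#' anchor for display, so the original url is shown and a click on the hit
opens pdf.js (built into firefox) on the matching page. A hedged sketch of such
a rewrite; the method name and string handling are assumptions for
illustration, not the missing hunk itself:

    // sketch: replace the virtual "page=N" get/post property with a "#page=N"
    // fragment; assumes the page property is the last parameter of the url
    private static String toDisplayUrl(final String urlstring, final String key) {
        int p = urlstring.indexOf("?" + key + "=");
        if (p < 0) p = urlstring.indexOf("&" + key + "=");
        if (p < 0) return urlstring; // not a split pdf url, leave unchanged
        final String pagenumber = urlstring.substring(p + key.length() + 2);
        return urlstring.substring(0, p) + "#" + key + "=" + pagenumber;
    }
    // toDisplayUrl("http://example.org/doc.pdf?page=5", "page") -> "http://example.org/doc.pdf#page=5"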