diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index 936123360..240f8239d 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -57,7 +57,7 @@ public class SitemapImporter extends Thread { @Override public void run() { try { - logger.info("Start parsing sitemap file " + this.siteMapURL); + logger.info("Start parsing sitemap file " + this.siteMapURL.toNormalform(true)); sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL, this.crawlingProfile.getAgent()); parser.start(); URLEntry item; diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 7b5daaba7..24af35411 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -36,6 +36,7 @@ import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collection; import java.util.Date; +import java.util.HashSet; import java.util.List; import org.apache.pdfbox.exceptions.CryptographyException; @@ -51,6 +52,7 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.util.PDFTextStripper; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; @@ -64,6 +66,9 @@ import net.yacy.kelondro.util.MemoryControl; public class pdfParser extends AbstractParser implements Parser { + public static boolean individualPages = false; + public static String individualPagePropertyname = "page"; + public pdfParser() { super("Acrobat Portable Document Parser"); this.SUPPORTED_EXTENSIONS.add("pdf"); @@ -78,7 +83,7 @@ public class pdfParser extends AbstractParser implements Parser { static { clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise; } - + @Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { @@ -141,51 +146,117 @@ public class pdfParser extends AbstractParser implements Parser { if (docTitle == null || docTitle.isEmpty()) { docTitle = MultiProtocolURL.unescape(location.getFileName()); } - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - Collection pdflinks = null; + if (docTitle == null) { + docTitle = docSubject; + } + String[] docKeywords = null; + if (docKeywordStr != null) { + docKeywords = docKeywordStr.split(" |,"); + } + + Collection[] pdflinks = null; + Document[] result = null; try { - // create a writer for output - final PDFTextStripper stripper = new PDFTextStripper("UTF-8"); + // get the links + pdflinks = extractPdfLinks(pdfDoc); + + // get the fulltext (either per document or for each page) + final PDFTextStripper stripper = new PDFTextStripper("UTF-8"); + + if (individualPages) { + // this is a hack which stores individual pages of the source pdf into individual index documents + // the new documents will get a virtual link with a post argument page=X appended to the original url + + // collect text + int pagecount = pdfDoc.getNumberOfPages(); + String[] pages = new String[pagecount]; + for (int page = 1; page <= pagecount; page++) { + stripper.setStartPage(page); + stripper.setEndPage(page); + pages[page - 1] = stripper.getText(pdfDoc); + System.out.println("PAGE " + page + ": " + pages[page - 1]); + } + + // create individual documents for each page + assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length; + result = new Document[Math.min(pages.length, pdflinks.length)]; + String loc = location.toNormalform(true); + for (int page = 0; page < result.length; page++) { + result[page] = new Document( + new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash + mimeType, + "UTF-8", + this, + null, + docKeywords, + singleList(docTitle), + docAuthor, + docPublisher, + null, + null, + 0.0f, 0.0f, + pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), + pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], + null, + null, + false, + docDate); + } + } else { + // collect the whole text at once + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); + byte[] contentBytes = new byte[0]; + stripper.setEndPage(3); // get first 3 pages (always) + writer.append(stripper.getText(pdfDoc)); + contentBytes = writer.getBytes(); // remember text in case of interrupting thread - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread + if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read + stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) + stripper.setEndPage(Integer.MAX_VALUE); // set to default + // we start the pdf parsing in a separate thread to ensure that it can be terminated + final PDDocument pdfDocC = pdfDoc; + final Thread t = new Thread() { + @Override + public void run() { + Thread.currentThread().setName("pdfParser.getText:" + location); + try { + writer.append(stripper.getText(pdfDocC)); + } catch (final Throwable e) {} + } + }; + t.start(); + t.join(3000); // pdfbox likes to forget to terminate ... (quite often) + if (t.isAlive()) t.interrupt(); + } + contentBytes = writer.getBytes(); // get final text before closing writer - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread() { - @Override - public void run() { - Thread.currentThread().setName("pdfParser.getText:" + location); - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); - if (t.isAlive()) t.interrupt(); - } - contentBytes = writer.getBytes(); // get final text before closing writer - pdflinks = extractPdfLinks(pdfDoc); + Collection pdflinksCombined = new HashSet(); + for (Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); + result = new Document[]{new Document( + location, + mimeType, + "UTF-8", + this, + null, + docKeywords, + singleList(docTitle), + docAuthor, + docPublisher, + null, + null, + 0.0f, 0.0f, + contentBytes, + pdflinksCombined, + null, + null, + false, + docDate)}; + } } catch (final Throwable e) { //close the writer (in finally) //throw new Parser.Failure(e.getMessage(), location); } finally { try {pdfDoc.close();} catch (final Throwable e) {} - writer.close(); - } - - String[] docKeywords = null; - if (docKeywordStr != null) { - docKeywords = docKeywordStr.split(" |,"); - } - if (docTitle == null) { - docTitle = docSubject; } // clear resources in pdfbox. they say that is resolved but it's not. see: @@ -201,25 +272,7 @@ public class pdfParser extends AbstractParser implements Parser { pdfDoc = null; clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); - return new Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0f, 0.0f, - contentBytes, - (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks, - null, - null, - false, - docDate)}; + return result; } /** @@ -227,11 +280,14 @@ public class pdfParser extends AbstractParser implements Parser { * @param pdf the document to parse * @return all detected links */ - private Collection extractPdfLinks(final PDDocument pdf) { - final Collection pdflinks = new ArrayList(); + private Collection[] extractPdfLinks(final PDDocument pdf) { @SuppressWarnings("unchecked") List allPages = pdf.getDocumentCatalog().getAllPages(); + @SuppressWarnings("unchecked") + Collection[] linkCollections = (Collection[]) new Collection[allPages.size()]; + int pagecount = 0; for (PDPage page : allPages) { + final Collection pdflinks = new ArrayList(); try { List annotations = page.getAnnotations(); if (annotations != null) { @@ -248,8 +304,9 @@ public class pdfParser extends AbstractParser implements Parser { } } } catch (IOException ex) {} + linkCollections[pagecount++] = pdflinks; } - return pdflinks; + return linkCollections; } public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() { diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index fce4b2aca..012fa969b 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -392,14 +392,14 @@ public final class Protocol { parts.put("object", UTF8.StringBody("rwicount")); parts.put("ttl", UTF8.StringBody("0")); parts.put("env", UTF8.StringBody("")); - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "posting request to " + targetAddress); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "posting request to " + targetAddress); final Post post = new Post(targetAddress, targetHash, "/yacy/query.html", parts, timeout); - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received CONTENT from requesting " + targetAddress + (post.result == null ? "NULL" : (": length = " + post.result.length))); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received CONTENT from requesting " + targetAddress + (post.result == null ? "NULL" : (": length = " + post.result.length))); final Map result = FileUtils.table(post.result); if (result == null || result.isEmpty()) return new long[] {-1, -1}; - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString()); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESULT from requesting " + targetAddress + " : result = " + result.toString()); final String resp = result.get("response"); - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received RESPONSE from requesting " + targetAddress + " : response = " + resp); if (resp == null) return new long[] {-1, -1}; String magic = result.get("magic"); if (magic == null) magic = "0"; @@ -409,7 +409,7 @@ public final class Protocol { return new long[] {-1, -1}; } } catch (final Exception e ) { - ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage()); + //ConcurrentLog.info("**hello-DEBUG**queryRWICount**", "received EXCEPTION from requesting " + targetAddress + ": " + e.getMessage()); if (Network.log.isFine()) Network.log.fine("yacyClient.queryRWICount error:" + e.getMessage()); return new long[] {-1, -1}; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d6c8cd336..a9278b497 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -833,6 +833,8 @@ public final class Switchboard extends serverSwitch { TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); TextParser.setDenyExtension(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); + pdfParser.individualPages = getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); + pdfParser.individualPagePropertyname = getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); // start a loader this.log.config("Starting Crawl Loader"); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 88c4873fa..c90eeec96 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -284,6 +284,8 @@ public final class SwitchboardConstants { public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody"; public static final String PARSER_MIME_DENY = "parser.mime.deny"; public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny"; + public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages"; + public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key"; /** *

public static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds

diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java index e1f361ffb..1d24be921 100644 --- a/source/net/yacy/search/snippet/ResultEntry.java +++ b/source/net/yacy/search/snippet/ResultEntry.java @@ -27,6 +27,7 @@ package net.yacy.search.snippet; import java.io.IOException; +import java.net.MalformedURLException; import java.util.Comparator; import java.util.Date; @@ -36,6 +37,7 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Condenser; +import net.yacy.document.parser.pdfParser; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -126,16 +128,28 @@ public class ResultEntry implements Comparable, Comparator