|
|
|
@ -34,7 +34,6 @@ import java.io.IOException;
|
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
|
|
|
|
|
import net.yacy.cora.document.MultiProtocolURI;
|
|
|
|
|
import net.yacy.cora.document.UTF8;
|
|
|
|
|
import net.yacy.document.AbstractParser;
|
|
|
|
|
import net.yacy.document.Document;
|
|
|
|
|
import net.yacy.document.Parser;
|
|
|
|
@ -127,14 +126,14 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
docTitle = MultiProtocolURI.unescape(location.getFileName());
|
|
|
|
|
}
|
|
|
|
|
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
|
|
|
|
|
byte[] contentBytes = UTF8.getBytes("");
|
|
|
|
|
byte[] contentBytes = new byte[0];
|
|
|
|
|
try {
|
|
|
|
|
// create a writer for output
|
|
|
|
|
final PDFTextStripper stripper = new PDFTextStripper();
|
|
|
|
|
|
|
|
|
|
stripper.setEndPage(3); // get first 3 pages (always)
|
|
|
|
|
writer.append(stripper.getText(pdfDoc));
|
|
|
|
|
contentBytes = UTF8.getBytes(writer.toString()); // remember text in case of interrupting thread
|
|
|
|
|
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
|
|
|
|
|
|
|
|
|
|
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
|
|
|
|
|
stripper.setEndPage(Integer.MAX_VALUE); // set to default
|
|
|
|
@ -151,7 +150,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
t.join(3000);
|
|
|
|
|
if (t.isAlive()) t.interrupt();
|
|
|
|
|
pdfDoc.close();
|
|
|
|
|
contentBytes = UTF8.getBytes(writer.toString()); // get final text before closing writer
|
|
|
|
|
contentBytes = writer.getBytes(); // get final text before closing writer
|
|
|
|
|
writer.close();
|
|
|
|
|
} catch (final IOException e) {
|
|
|
|
|
// close the writer
|
|
|
|
|