getting the trash out

pull/1/head
Michael Peter Christen 12 years ago
parent 709e9b8ce7
commit 5344a1c5f7

@ -40,7 +40,18 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDMMType1Font;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont;
import org.apache.pdfbox.pdmodel.font.PDType1CFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
@ -75,12 +86,12 @@ public class pdfParser extends AbstractParser implements Parser {
throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location); throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
// create a pdf parser // create a pdf parser
final PDDocument pdfDoc; PDDocument pdfDoc;
//final PDFParser pdfParser; //final PDFParser pdfParser;
try { try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY); Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
pdfDoc = PDDocument.load(source); pdfDoc = PDDocument.load(source);
//pdfParser = new PDFParser(source); //PDFParser pdfParser = new PDFParser(source);
//pdfParser.parse(); //pdfParser.parse();
//pdfDoc = pdfParser.getPDDocument(); //pdfDoc = pdfParser.getPDDocument();
} catch (final IOException e) { } catch (final IOException e) {
@ -108,7 +119,7 @@ public class pdfParser extends AbstractParser implements Parser {
} }
// extracting some metadata // extracting some metadata
final PDDocumentInformation info = pdfDoc.getDocumentInformation(); PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null; String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
if (info != null) { if (info != null) {
docTitle = info.getTitle(); docTitle = info.getTitle();
@ -122,6 +133,7 @@ public class pdfParser extends AbstractParser implements Parser {
// info.getCreationDate()); // info.getCreationDate());
// info.getModificationDate(); // info.getModificationDate();
} }
info = null;
if (docTitle == null || docTitle.isEmpty()) { if (docTitle == null || docTitle.isEmpty()) {
docTitle = MultiProtocolURI.unescape(location.getFileName()); docTitle = MultiProtocolURI.unescape(location.getFileName());
@ -139,12 +151,13 @@ public class pdfParser extends AbstractParser implements Parser {
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated // we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread() { final Thread t = new Thread() {
@Override @Override
public void run() { public void run() {
Thread.currentThread().setName("pdfParser.getText:" + location); Thread.currentThread().setName("pdfParser.getText:" + location);
try { try {
writer.append(stripper.getText(pdfDoc)); writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {} } catch (final Throwable e) {}
} }
}; };
@ -181,8 +194,9 @@ public class pdfParser extends AbstractParser implements Parser {
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM // the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out. // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
COSName.clearResources(); pdfDoc = null;
PDFont.clearResources(); clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
return new Document[]{new Document( return new Document[]{new Document(
location, location,
mimeType, mimeType,
@ -203,6 +217,26 @@ public class pdfParser extends AbstractParser implements Parser {
false)}; false)};
} }
/**
 * Releases the static resource caches kept by the PDF library by calling
 * {@code clearResources()} on {@code PDFont}, on {@code COSName} and on a
 * list of concrete font classes.
 *
 * <p>Background, as recorded by the original author: after parsing a single
 * PDF the library's font cache was observed to occupy more than 80 MB of RAM
 * and was not released on its own, even though no font is needed for plain
 * text extraction. Searching for "PDFParser OutOfMemoryError" shows further
 * reports of the problem.
 *
 * <p>The method takes no arguments and returns nothing; its only effect is
 * the sequence of {@code clearResources()} calls below.
 */
@SuppressWarnings("static-access")
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
// Base font cache and the COS name cache.
PDFont.clearResources();
COSName.clearResources();
// The same call issued through each concrete font class.
// NOTE(review): these presumably resolve to the static method inherited from
// PDFont, which would make them redundant (hence the suppressed warning) --
// confirm against the PDFBox version in use before removing any of them.
PDType1Font.clearResources();
PDTrueTypeFont.clearResources();
PDType0Font.clearResources();
PDType1AfmPfbFont.clearResources();
PDType3Font.clearResources();
PDType1CFont.clearResources();
PDCIDFont.clearResources();
PDCIDFontType0Font.clearResources();
PDCIDFontType2Font.clearResources();
PDMMType1Font.clearResources();
PDSimpleFont.clearResources();
}
/** /**
* test * test
* @param args * @param args

@ -29,6 +29,7 @@ package net.yacy.kelondro.data.word;
import java.util.Collection; import java.util.Collection;
import java.util.Locale; import java.util.Locale;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest; import net.yacy.cora.order.Digest;
import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ARC;
@ -86,6 +87,10 @@ public class Word {
this.flags = null; this.flags = null;
} }
/**
 * Removes all entries from the class-level {@code hashCache}.
 */
public static void clearCache() {
    Word.hashCache.clear();
}
public void inc() { public void inc() {
this.count++; this.count++;
} }

@ -81,6 +81,7 @@ import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream; import java.util.zip.ZipInputStream;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
@ -151,6 +152,7 @@ import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader; import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.audioTagParser; import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation; import net.yacy.document.parser.html.Evaluation;
import net.yacy.gui.Tray; import net.yacy.gui.Tray;
import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables;
@ -2035,10 +2037,11 @@ public final class Switchboard extends serverSwitch {
try { try {
// flush caches in used libraries // flush caches in used libraries
PDFont.clearResources(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
// clear caches // clear caches
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
Word.clearCache();
Domains.clear(); Domains.clear();
// clean up image stack // clean up image stack

Loading…
Cancel
Save