upgraded ppt parser by migration of org.apache.poi from 3.17 to 5.3.0

This also fixes the security warning
https://github.com/yacy/yacy_search_server/security/dependabot/37
pull/657/head
Michael Peter Christen 4 months ago
parent 687820788d
commit 833d720989

@ -60,8 +60,8 @@
<dependency org="org.apache.lucene" name="lucene-suggest" rev="9.0.0"/> <dependency org="org.apache.lucene" name="lucene-suggest" rev="9.0.0"/>
<dependency org="org.apache.pdfbox" name="pdfbox" rev="3.0.2" /> <dependency org="org.apache.pdfbox" name="pdfbox" rev="3.0.2" />
<dependency org="org.apache.poi" name="poi" rev="3.17" /> <dependency org="org.apache.poi" name="poi" rev="5.3.0" />
<dependency org="org.apache.poi" name="poi-scratchpad" rev="3.17" /> <dependency org="org.apache.poi" name="poi-scratchpad" rev="5.3.0" />
<dependency org="org.apache.solr" name="solr-core" rev="9.0.0" conf="compile->master"/> <dependency org="org.apache.solr" name="solr-core" rev="9.0.0" conf="compile->master"/>
<dependency org="org.apache.solr" name="solr-scripting" rev="9.0.0" conf="compile->master"/> <dependency org="org.apache.solr" name="solr-scripting" rev="9.0.0" conf="compile->master"/>

@ -32,8 +32,14 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.List; import java.util.List;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
@ -42,8 +48,6 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
public class pptParser extends AbstractParser implements Parser { public class pptParser extends AbstractParser implements Parser {
public pptParser(){ public pptParser(){
@ -69,40 +73,47 @@ public class pptParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream source) throws Parser.Failure, final InputStream source) throws Parser.Failure, InterruptedException {
InterruptedException {
try { try {
/* final BufferedInputStream bis = new BufferedInputStream(source);
* create new PowerPointExtractor and extract text and notes final HSLFSlideShow slideShow = new HSLFSlideShow(bis);
* of the document final SummaryInformation summaryInfo = slideShow.getSummaryInformation();
*/ final DocumentSummaryInformation docSummaryInfo = slideShow.getDocumentSummaryInformation();
final PowerPointExtractor pptExtractor = new PowerPointExtractor(new BufferedInputStream(source)); @SuppressWarnings({ "rawtypes", "unchecked" })
final String contents = pptExtractor.getText(true, true).trim(); final SlideShowExtractor<?,?> pptExtractor = new SlideShowExtractor(slideShow);
String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); final String contents = pptExtractor.getText().trim();
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length(); String title = summaryInfo == null ? "" : summaryInfo.getTitle();
while (true) { if (title.length() == 0) {
title = title.replaceAll(" ", " "); title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() == l) break; if (title.length() > 80) title = title.substring(0, 80);
l = title.length(); int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
} }
// get keywords (for yacy as array)
final String keywords = pptExtractor.getSummaryInformation().getKeywords(); final String author = summaryInfo == null ? "" : summaryInfo.getAuthor();
final String keywords = summaryInfo == null ? "" : summaryInfo.getKeywords();
final String subject = summaryInfo == null ? "" : summaryInfo.getSubject();
//final String comments = summaryInfo == null ? "" : summaryInfo.getComments();
final Date lastSaveDate = summaryInfo == null ? null : summaryInfo.getLastSaveDateTime();
//final String category = docSummaryInfo == null ? "" : docSummaryInfo.getCategory();
final String company = docSummaryInfo == null ? "" : docSummaryInfo.getCompany();
//final String manager = docSummaryInfo == null ? "" : docSummaryInfo.getManager();
final String[] keywlist; final String[] keywlist;
if (keywords != null && !keywords.isEmpty()) { if (keywords != null && !keywords.isEmpty()) {
keywlist = CommonPattern.COMMA.split(keywords); keywlist = CommonPattern.COMMA.split(keywords);
} else keywlist = null; } else keywlist = null;
final String subject = pptExtractor.getSummaryInformation().getSubject(); final List<String> descriptions = new ArrayList<>();
List<String> descriptions = new ArrayList<String>();
if (subject != null && !subject.isEmpty()) descriptions.add(subject); if (subject != null && !subject.isEmpty()) descriptions.add(subject);
/*
* create the plasmaParserDocument for the database
* and set shortText and bodyText properly
*/
final Document[] docs = new Document[]{new Document( final Document[] docs = new Document[]{new Document(
location, location,
mimeType, mimeType,
@ -111,8 +122,8 @@ public class pptParser extends AbstractParser implements Parser {
null, null,
keywlist, keywlist,
singleList(title), singleList(title),
pptExtractor.getSummaryInformation().getAuthor(), // may be null author, // may be null
pptExtractor.getDocSummaryInformation().getCompany(), company,
null, null,
descriptions, descriptions,
0.0d, 0.0d, 0.0d, 0.0d,
@ -121,9 +132,9 @@ public class pptParser extends AbstractParser implements Parser {
null, null,
null, null,
false, false,
pptExtractor.getSummaryInformation().getLastSaveDateTime() // may be null lastSaveDate // may be null
)}; )};
try {pptExtractor.close();} catch (IOException e1) {} try {pptExtractor.close();} catch (final IOException e1) {}
return docs; return docs;
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;

Loading…
Cancel
Save