From cb2c17d236188693f99e8a8e0e8df6fdaee57b1f Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 29 Jun 2014 02:54:09 +0200 Subject: [PATCH] extract author and keywords in .doc and .ppt parser --- source/net/yacy/document/parser/docParser.java | 12 ++++++++++-- source/net/yacy/document/parser/pptParser.java | 10 ++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 8c0263ac3..a09f9e391 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -86,6 +86,14 @@ public class docParser extends AbstractParser implements Parser { if (title.length() == l) break; l = title.length(); } + // get keywords (for yacy as array) + final String keywords = extractor.getSummaryInformation().getKeywords(); + final String[] keywlist; + if (keywords != null && !keywords.isEmpty()) { + keywlist = keywords.split(","); + } else { + keywlist = null; + } Document[] docs; docs = new Document[]{new Document( @@ -94,9 +102,9 @@ public class docParser extends AbstractParser implements Parser { "UTF-8", this, null, - null, + keywlist, singleList(title), - "", // TODO: AUTHOR + extractor.getSummaryInformation().getAuthor(), // constructor can handle null extractor.getDocSummaryInformation().getCompany(), // publisher null, null, diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index f21f188bf..e0773f2ba 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -78,6 +78,12 @@ public class pptParser extends AbstractParser implements Parser { if (title.length() == l) break; l = title.length(); } + // get keywords (for yacy as array) + final String keywords = pptExtractor.getSummaryInformation().getKeywords(); + final String[] keywlist; + if (keywords != null && !keywords.isEmpty()) { + keywlist = 
keywords.split(","); + } else keywlist = null; /* * create the plasmaParserDocument for the database @@ -89,9 +95,9 @@ public class pptParser extends AbstractParser implements Parser { "UTF-8", this, null, - null, + keywlist, singleList(title), - "", // TODO: AUTHOR + pptExtractor.getSummaryInformation().getAuthor(), // may be null pptExtractor.getDocSummaryInformation().getCompany(), null, null,