From 87688969753d258880fa10681868320b900fc9bc Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 6 Sep 2015 00:04:54 +0200 Subject: [PATCH] extract lastmodified from openoffice doc set lastmod date in office document parsers --- .../net/yacy/document/parser/docParser.java | 38 +++++++++---------- .../net/yacy/document/parser/odtParser.java | 4 +- .../net/yacy/document/parser/ooxmlParser.java | 4 +- .../net/yacy/document/parser/pptParser.java | 38 +++++++++---------- .../document/parser/xml/ODMetaHandler.java | 27 ++++++++++++- 5 files changed, 70 insertions(+), 41 deletions(-) diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index a33844382..f6a9af827 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -29,7 +29,6 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.ArrayList; -import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; @@ -110,24 +109,25 @@ public class docParser extends AbstractParser implements Parser { Document[] docs; docs = new Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - keywlist, - singleList(title), - extractor.getSummaryInformation().getAuthor(), // constuctor can handle null - extractor.getDocSummaryInformation().getCompany(), // publisher - null, - descriptions, - 0.0f, 0.0f, - contents.toString(), - null, - null, - null, - false, - new Date())}; + location, + mimeType, + "UTF-8", + this, + null, + keywlist, + singleList(title), + extractor.getSummaryInformation().getAuthor(), // constuctor can handle null + extractor.getDocSummaryInformation().getCompany(), // publisher + null, + descriptions, + 0.0f, 0.0f, + contents.toString(), + null, + null, + null, + false, + extractor.getSummaryInformation().getLastSaveDateTime() // maybe null + )}; return docs; } diff --git a/source/net/yacy/document/parser/odtParser.java 
b/source/net/yacy/document/parser/odtParser.java index 2f574f0c0..859f308fe 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -120,6 +120,7 @@ public class odtParser extends AbstractParser implements Parser { String docLongTitle = null; String docAuthor = null; String docLanguage = null; + Date docModified = null; // opening the file as zip file final ZipFile zipFile = new ZipFile(dest); @@ -160,6 +161,7 @@ public class odtParser extends AbstractParser implements Parser { docLongTitle = metaData.getSubject(); docAuthor = metaData.getCreator(); docLanguage = metaData.getLanguage(); + docModified = metaData.getLastModified(); // maybe null } } @@ -201,7 +203,7 @@ public class odtParser extends AbstractParser implements Parser { null, null, false, - new Date() + docModified )}; return docs; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 9072938f4..0da5b725b 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -102,6 +102,7 @@ public class ooxmlParser extends AbstractParser implements Parser { String docLongTitle = null; String docAuthor = null; String docLanguage = null; + Date docModified = null; // opening the file as zip file final ZipFile zipFile= new ZipFile(dest); @@ -145,6 +146,7 @@ public class ooxmlParser extends AbstractParser implements Parser { docLongTitle = metaData.getSubject(); docAuthor = metaData.getCreator(); docLanguage = metaData.getLanguage(); + docModified = metaData.getLastModified(); } } @@ -185,7 +187,7 @@ public class ooxmlParser extends AbstractParser implements Parser { null, null, false, - new Date())}; + docModified)}; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/pptParser.java 
b/source/net/yacy/document/parser/pptParser.java index f05cf8dec..b41ff3eac 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -30,7 +30,6 @@ package net.yacy.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; import java.util.ArrayList; -import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; @@ -103,24 +102,25 @@ public class pptParser extends AbstractParser implements Parser { * and set shortText and bodyText properly */ final Document[] docs = new Document[]{new Document( - location, - mimeType, - "UTF-8", - this, - null, - keywlist, - singleList(title), - pptExtractor.getSummaryInformation().getAuthor(), // may be null - pptExtractor.getDocSummaryInformation().getCompany(), - null, - descriptions, - 0.0f, 0.0f, - contents, - null, - null, - null, - false, - new Date())}; + location, + mimeType, + "UTF-8", + this, + null, + keywlist, + singleList(title), + pptExtractor.getSummaryInformation().getAuthor(), // may be null + pptExtractor.getDocSummaryInformation().getCompany(), + null, + descriptions, + 0.0f, 0.0f, + contents, + null, + null, + null, + false, + pptExtractor.getSummaryInformation().getLastSaveDateTime() // may be null + )}; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/xml/ODMetaHandler.java b/source/net/yacy/document/parser/xml/ODMetaHandler.java index b068548c4..8a9a6bee8 100644 --- a/source/net/yacy/document/parser/xml/ODMetaHandler.java +++ b/source/net/yacy/document/parser/xml/ODMetaHandler.java @@ -26,6 +26,9 @@ package net.yacy.document.parser.xml; +import java.text.ParseException; +import java.util.Date; +import net.yacy.cora.date.ISO8601Formatter; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -39,6 +42,7 @@ public class ODMetaHandler 
extends DefaultHandler { private String docSubject = null; private String docTitle = null; private String docDescription = null; + private String docLastmodified = null; public ODMetaHandler() { } @@ -67,7 +71,9 @@ public class ODMetaHandler extends DefaultHandler { this.docTitle = buffer.toString(); } else if ("dc:description".equals(tag)) { this.docDescription = buffer.toString(); - } + } else if ("dcterms:modified".equals(tag) || "dc:date".equals(tag)) { // Microsoft uses dcterms:modified, OpenOffice dc:date + this.docLastmodified = buffer.toString(); + } } public String getCreator() { @@ -89,5 +95,24 @@ public class ODMetaHandler extends DefaultHandler { public String getDescription() { return docDescription; } + + /** + * get the last modification date of the document + * + * @return date or null + */ + public Date getLastModified() { + Date d; + if (docLastmodified != null && !docLastmodified.isEmpty()) { + try { + d = ISO8601Formatter.FORMATTER.parse(this.docLastmodified, 0).getTime(); + } catch (ParseException ex) { + d = null; + } + } else { + d = null; + } + return d; + } }