Added and integrated a new date detection class which can identify date
notions within the full text of a document. This class also attempts to
identify dates that are abbreviated, that lack a year, or that are
described by names for special days, like 'Halloween'. If a date has no
year given, the current year and following years are considered.
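
To illustrate the year-completion rule, here is a minimal sketch (this
is not the actual DateDetection code; the class and method names are
illustrative assumptions):

    import java.time.LocalDate;
    import java.time.MonthDay;
    import java.util.ArrayList;
    import java.util.List;

    // Minimal sketch of the year-completion rule described above:
    // a day/month expression that carries no year is expanded into
    // candidate dates for the current year and the following year.
    public class YearCompletionSketch {

        static List<LocalDate> candidates(MonthDay monthDay) {
            int currentYear = LocalDate.now().getYear();
            List<LocalDate> result = new ArrayList<>();
            for (int year = currentYear; year <= currentYear + 1; year++) {
                result.add(monthDay.atYear(year));
            }
            return result;
        }

        public static void main(String[] args) {
            // 'Halloween' would first be mapped to the fixed day October 31,
            // then expanded to one candidate date per year.
            System.out.println(candidates(MonthDay.of(10, 31)));
        }
    }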

This process may therefore attach a large set of dates to a document,
either because several dates are given in the document or because a
date is ambiguous. Four new Solr fields are used to store the parsing
result (an example follows the list):

dates_in_content_sxt:
if date expressions can be found in the content, these dates are listed
here in the order of their appearance

dates_in_content_count_i:
the number of entries in dates_in_content_sxt

date_in_content_min_dt:
if dates_in_content_sxt is filled, this contains the oldest date from
the list of available dates

date_in_content_max_dt:
if dates_in_content_sxt is filled, this contains the youngest date from
the list of available dates, which may also lie in the future
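
As an illustration, a document containing two date expressions might
yield fields like these in a Solr response (hypothetical values):

    <arr name="dates_in_content_sxt">
      <str>2014-12-24T00:00:00Z</str>
      <str>2015-03-01T00:00:00Z</str>
    </arr>
    <int name="dates_in_content_count_i">2</int>
    <date name="date_in_content_min_dt">2014-12-24T00:00:00Z</date>
    <date name="date_in_content_max_dt">2015-03-01T00:00:00Z</date>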

These fields are deactivated by default because the evaluation of the
regular expressions used to detect dates is still too CPU-intensive.
Future enhancements may make it possible to switch this on by default.
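
In the collection schema file, disabled fields are the ones prefixed
with '#' (as in the first hunk of the diff below, where '##' lines are
descriptions); activating the feature would presumably amount to
removing that prefix, e.g.:

    ## if date expressions can be found in the content, these dates are listed here in order of the appearances
    dates_in_content_sxt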

The purpose of these fields is the creation of calendar-like search
facets, to be implemented next.
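
Such a facet could be backed by an ordinary Solr range-facet request
over one of the new date fields, for example (a hypothetical query,
not part of this commit):

    q=*:*
    facet=true
    facet.range=date_in_content_min_dt
    facet.range.start=NOW/DAY
    facet.range.end=NOW/DAY+1MONTH
    facet.range.gap=+1DAY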

Michael Peter Christen
parent c3c2b6999b
commit 66b5a56976

@@ -18,16 +18,16 @@ sku
 ## last-modified from http header, date (mandatory field)
 last_modified
-## if date expressions can be found in the content, these dates are listed here in order of the appearances"),
+## if date expressions can be found in the content, these dates are listed here in order of the appearances
 #dates_in_content_sxt
 ## the number of entries in dates_in_content_sxt
 #dates_in_content_count_i
-## if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates"),
+## if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates
 #date_in_content_min_dt
-## if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future"),
+## if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future
 #date_in_content_max_dt
 ## mime-type of document, string (mandatory field)

@@ -88,7 +88,7 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
 * Note: The short day format doesn't include any timezone information. This method
 * transforms the date into the GMT/UTC timezone. Example: If the local system time is,
 * 2007-12-18 01:15:00 +0200, then the resulting String will be "2007-12-17".
-* In case you need a format with a timezon offset, use {@link #formatShortDay(TimeZone)}
+* In case you need a format with a timezone offset, use {@link #formatShortDay(TimeZone)}
 * @return a String representation of the current system date in GMT using the
 * short day format, e.g. "20071218".
 */

@@ -79,7 +79,7 @@ public class Transactions {
 archive = new Snapshots(archiveDir);
 }
-public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent) {
+public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
 // GET METADATA FROM DOC
 final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@@ -132,7 +132,7 @@ public class Transactions {
 public void run() {
 executorRunning.incrementAndGet();
 try {
-Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);
+Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
 } catch (Throwable e) {} finally {
 executorRunning.decrementAndGet();
 }
@@ -140,7 +140,7 @@ public class Transactions {
 };
 executor.execute(t);
 } else {
-success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);
+success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
 }
 }

@@ -86,7 +86,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 }
 //get words from document
-final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words();
+final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false, false).words();
 // generate potential tags from document title, description and subject
 final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;

@@ -25,6 +25,7 @@ import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -85,13 +86,15 @@ public final class Condenser {
 //public int RESULT_DIFF_SENTENCES = -1;
 public Bitfield RESULT_FLAGS = new Bitfield(4);
 private final Identificator languageIdentificator;
+public LinkedHashSet<Date> dates_in_content;
 public Condenser(
 final Document document,
 final boolean indexText,
 final boolean indexMedia,
 final WordCache meaningLib,
-final boolean doAutotagging
+final boolean doAutotagging,
+final boolean findDatesInContent
 ) {
 Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
 // if addMedia == true, then all the media links are also parsed and added to the words
@@ -99,7 +102,8 @@ public final class Condenser {
 this.words = new HashMap<String, Word>();
 this.synonyms = new LinkedHashSet<String>();
 this.RESULT_FLAGS = new Bitfield(4);
+this.dates_in_content = new LinkedHashSet<Date>();
 // construct flag set for document
 ContentDomain contentDomain = document.getContentDomain();
 if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
@@ -115,7 +119,9 @@ public final class Condenser {
 Map.Entry<AnchorURL, String> entry;
 if (indexText) {
-createCondensement(document.getTextString(), meaningLib, doAutotagging);
+String text = document.getTextString();
+if (findDatesInContent) this.dates_in_content = DateDetection.parse(text);
+createCondensement(text, meaningLib, doAutotagging);
 // the phrase counter:
 // phrase 0 are words taken from the URL
 // phrase 1 is the MainTitle

@@ -136,9 +136,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
 public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException {
 if (tag == null) return;
 tag = tag.toLowerCase();
-if ("record".equals(tag) || "document".equals(tag)) {
+if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) {
 this.surrogate = new DCEntry();
-} else if ("element".equals(tag)) {
+} else if ("element".equals(tag) || "str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)) {
 this.elementName = atts.getValue("name");
 } else if ("value".equals(tag)) {
 this.buffer.setLength(0);
@@ -154,7 +154,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
 public void endElement(final String uri, final String name, String tag) {
 if (tag == null) return;
 tag = tag.toLowerCase();
-if ("record".equals(tag) || "document".equals(tag)) {
+if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) {
 //System.out.println("A Title: " + this.surrogate.title());
 try {
 this.surrogates.put(this.surrogate);
@@ -169,6 +169,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
 } else if ("element".equals(tag)) {
 this.buffer.setLength(0);
 this.parsingValue = false;
+} else if ("str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)){
+final String value = buffer.toString().trim();
+if (this.elementName != null) {
+this.surrogate.getMap().put(this.elementName, new String[]{value});
+}
+this.parsingValue = false;
 } else if ("value".equals(tag)) {
 //System.out.println("BUFFER-SIZE=" + buffer.length());
 final String value = buffer.toString().trim();

@@ -120,7 +120,7 @@ public class torrentParser extends AbstractParser implements Parser {
 byte[] b = FileUtils.read(new File(args[0]));
 torrentParser parser = new torrentParser();
 Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
-Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false);
+Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false, false);
 Map<String, Word> w = c.words();
 for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
 } catch (final IOException e) {

@@ -2751,7 +2751,7 @@ public final class Switchboard extends serverSwitch {
 new Condenser(
 in.documents[i], in.queueEntry.profile().indexText(),
 in.queueEntry.profile().indexMedia(),
-LibraryProvider.dymLib, true);
+LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
 // update image result list statistics
 // its good to do this concurrently here, because it needs a DNS lookup
@@ -2853,7 +2853,8 @@ public final class Switchboard extends serverSwitch {
 searchEvent,
 sourceName,
 getConfigBool(SwitchboardConstants.DHT_ENABLED, false),
-sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
+this.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null,
+this.getConfig("crawler.http.acceptLanguage", null));
 final RSSFeed feed =
 EventChannel.channels(queueEntry.initiator() == null
 ? EventChannel.PROXY
@@ -3186,7 +3187,9 @@ public final class Switchboard extends serverSwitch {
 if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
 throw new Parser.Failure("indexing is denied", url);
 }
-final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
+final Condenser condenser = new Condenser(
+document, true, true, LibraryProvider.dymLib, true,
+Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
 ResultImages.registerImages(url, document, true);
 Switchboard.this.webStructure.generateCitationReference(url, document);
 storeDocumentIndex(

@@ -158,7 +158,7 @@ public class DocumentIndex extends Segment {
 int c = 0;
 for ( final Document document : documents ) {
 if (document == null) continue;
-final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
+final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true, true);
 rows[c++] =
 super.storeDocument(
 url,
@@ -171,6 +171,7 @@ public class DocumentIndex extends Segment {
 null,
 DocumentIndex.class.getName() + ".add",
 false,
+null,
 null);
 }
 return rows;

@@ -761,7 +761,7 @@ public class Segment {
 }
 // get the word set
 Set<String> words = null;
-words = new Condenser(document, true, true, null, false).words().keySet();
+words = new Condenser(document, true, true, null, false, false).words().keySet();
 // delete all word references
 int count = 0;

@@ -42,6 +42,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -494,6 +495,34 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
 long firstSeen = segment.getFirstSeenTime(digestURL.hash());
 if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
 add(doc, CollectionSchema.last_modified, lastModified);
 }
+if (allAttr ||
+contains(CollectionSchema.dates_in_content_sxt) || contains(CollectionSchema.dates_in_content_count_i) ||
+contains(CollectionSchema.date_in_content_min_dt) || contains(CollectionSchema.date_in_content_max_dt)) {
+LinkedHashSet<Date> dates_in_content = condenser.dates_in_content;
+if (allAttr || contains(CollectionSchema.dates_in_content_count_i)) {
+add(doc, CollectionSchema.dates_in_content_count_i, dates_in_content.size());
+}
+if (dates_in_content.size() > 0) {
+if (allAttr || contains(CollectionSchema.dates_in_content_sxt)) {
+String[] dates = new String[dates_in_content.size()];
+int i = 0; for (Date d: dates_in_content) dates[i++] = org.apache.solr.schema.TrieDateField.formatExternal(d);
+add(doc, CollectionSchema.dates_in_content_sxt, dates);
+}
+// order the dates to get the oldest and youngest
+TreeSet<Date> ordered_dates = new TreeSet<>();
+ordered_dates.addAll(dates_in_content);
+if (allAttr || contains(CollectionSchema.date_in_content_min_dt)) {
+Date date_in_content_min_dt = ordered_dates.iterator().next();
+add(doc, CollectionSchema.date_in_content_min_dt, date_in_content_min_dt);
+}
+if (allAttr || contains(CollectionSchema.date_in_content_max_dt)) {
+Date date_in_content_max_dt = ordered_dates.descendingIterator().next();
+add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt);
+}
+}
+}
 if (allAttr || contains(CollectionSchema.keywords)) {
 String keywords = document.dc_subject(' ');

@@ -35,6 +35,10 @@ public enum CollectionSchema implements SchemaDeclaration {
 sku(SolrType.string, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
 //sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
 last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
+dates_in_content_sxt(SolrType.string, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here in order of the appearances"),
+dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"),
+date_in_content_min_dt(SolrType.date, true, true, false, false, false, "if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates"),
+date_in_content_max_dt(SolrType.date, true, true, false, false, false, "if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future"),
 content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
 http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"),
 www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),
