diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 95a27ecbe..aa477b2cf 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -18,16 +18,16 @@ sku ## last-modified from http header, date (mandatory field) last_modified -## if date expressions can be found in the content, these dates are listed here in order of the appearances"), +## if date expressions can be found in the content, these dates are listed here in order of the appearances #dates_in_content_sxt ## the number of entries in dates_in_content_sxt #dates_in_content_count_i -## if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates"), +## if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates #date_in_content_min_dt -## if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future"), +## if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future #date_in_content_max_dt ## mime-type of document, string (mandatory field) diff --git a/source/net/yacy/cora/date/GenericFormatter.java b/source/net/yacy/cora/date/GenericFormatter.java index c8d76d246..e824f383d 100644 --- a/source/net/yacy/cora/date/GenericFormatter.java +++ b/source/net/yacy/cora/date/GenericFormatter.java @@ -88,7 +88,7 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter * Note: The short day format doesn't include any timezone information. This method * transforms the date into the GMT/UTC timezone. Example: If the local system time is, * 2007-12-18 01:15:00 +0200, then the resulting String will be "2007-12-17". - * In case you need a format with a timezon offset, use {@link #formatShortDay(TimeZone)} + * In case you need a format with a timezone offset, use {@link #formatShortDay(TimeZone)} * @return a String representation of the current system date in GMT using the * short day format, e.g. "20071218". */ diff --git a/source/net/yacy/crawler/data/Transactions.java b/source/net/yacy/crawler/data/Transactions.java index 0bdbfbaf9..e98f7dd95 100644 --- a/source/net/yacy/crawler/data/Transactions.java +++ b/source/net/yacy/crawler/data/Transactions.java @@ -79,7 +79,7 @@ public class Transactions { archive = new Snapshots(archiveDir); } - public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent) { + public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { // GET METADATA FROM DOC final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); @@ -132,7 +132,7 @@ public class Transactions { public void run() { executorRunning.incrementAndGet(); try { - Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath); + Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); } catch (Throwable e) {} finally { executorRunning.decrementAndGet(); } @@ -140,7 +140,7 @@ public class Transactions { }; executor.execute(t); } else { - success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath); + success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); } } diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 0d49dffe3..1c2afa2a6 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -86,7 +86,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } //get words from document - final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words(); + final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, false, false).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 54df12232..3c6b94e1a 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -25,6 +25,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -85,13 +86,15 @@ public final class Condenser { //public int RESULT_DIFF_SENTENCES = -1; public Bitfield RESULT_FLAGS = new Bitfield(4); private final Identificator languageIdentificator; - + public LinkedHashSet dates_in_content; + public Condenser( final Document document, final boolean indexText, final boolean indexMedia, final WordCache meaningLib, - final boolean doAutotagging + final boolean doAutotagging, + final boolean findDatesInContent ) { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words @@ -99,7 +102,8 @@ public final class Condenser { this.words = new HashMap(); this.synonyms = new LinkedHashSet(); this.RESULT_FLAGS = new Bitfield(4); - + this.dates_in_content = new LinkedHashSet(); + // construct flag set for document ContentDomain contentDomain = document.getContentDomain(); if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); @@ -115,7 +119,9 @@ public final class Condenser { Map.Entry entry; if (indexText) { - createCondensement(document.getTextString(), meaningLib, doAutotagging); + String text = document.getTextString(); + if (findDatesInContent) this.dates_in_content = DateDetection.parse(text); + createCondensement(text, meaningLib, doAutotagging); // the phrase counter: // phrase 0 are words taken from the URL // phrase 1 is the MainTitle diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 7de52065c..25e385092 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -136,9 +136,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable { public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException { if (tag == null) return; tag = tag.toLowerCase(); - if ("record".equals(tag) || "document".equals(tag)) { + if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) { this.surrogate = new DCEntry(); - } else if ("element".equals(tag)) { + } else if ("element".equals(tag) || "str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)) { this.elementName = atts.getValue("name"); } else if ("value".equals(tag)) { this.buffer.setLength(0); @@ -154,7 +154,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { public void endElement(final String uri, final String name, String tag) { if (tag == null) return; tag = tag.toLowerCase(); - if ("record".equals(tag) || "document".equals(tag)) { + if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) { //System.out.println("A Title: " + this.surrogate.title()); try { this.surrogates.put(this.surrogate); @@ -169,6 +169,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } else if ("element".equals(tag)) { this.buffer.setLength(0); this.parsingValue = false; + } else if ("str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)){ + final String value = buffer.toString().trim(); + if (this.elementName != null) { + this.surrogate.getMap().put(this.elementName, new String[]{value}); + } + this.parsingValue = false; } else if ("value".equals(tag)) { //System.out.println("BUFFER-SIZE=" + buffer.length()); final String value = buffer.toString().trim(); diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 08191a6dc..5691d14c6 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -120,7 +120,7 @@ public class torrentParser extends AbstractParser implements Parser { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false); + Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false, false); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (final IOException e) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9e3a79027..b4a63e1ad 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2751,7 +2751,7 @@ public final class Switchboard extends serverSwitch { new Condenser( in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), - LibraryProvider.dymLib, true); + LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt)); // update image result list statistics // its good to do this concurrently here, because it needs a DNS lookup @@ -2853,7 +2853,8 @@ public final class Switchboard extends serverSwitch { searchEvent, sourceName, getConfigBool(SwitchboardConstants.DHT_ENABLED, false), - sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null); + this.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, + this.getConfig("crawler.http.acceptLanguage", null)); final RSSFeed feed = EventChannel.channels(queueEntry.initiator() == null ? EventChannel.PROXY @@ -3186,7 +3187,9 @@ public final class Switchboard extends serverSwitch { if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { throw new Parser.Failure("indexing is denied", url); } - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); + final Condenser condenser = new Condenser( + document, true, true, LibraryProvider.dymLib, true, + Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt)); ResultImages.registerImages(url, document, true); Switchboard.this.webStructure.generateCitationReference(url, document); storeDocumentIndex( diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index fa1bf242f..29a61bad2 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -158,7 +158,7 @@ public class DocumentIndex extends Segment { int c = 0; for ( final Document document : documents ) { if (document == null) continue; - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true, true); rows[c++] = super.storeDocument( url, @@ -171,6 +171,7 @@ public class DocumentIndex extends Segment { null, DocumentIndex.class.getName() + ".add", false, + null, null); } return rows; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index a5c8d59e5..7e8ff154f 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -761,7 +761,7 @@ public class Segment { } // get the word set Set words = null; - words = new Condenser(document, true, true, null, false).words().keySet(); + words = new Condenser(document, true, true, null, false, false).words().keySet(); // delete all word references int count = 0; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index ffc1ae77e..092c685f2 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -42,6 +42,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; @@ -494,6 +495,34 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri long firstSeen = segment.getFirstSeenTime(digestURL.hash()); if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier add(doc, CollectionSchema.last_modified, lastModified); + } + if (allAttr || + contains(CollectionSchema.dates_in_content_sxt) || contains(CollectionSchema.dates_in_content_count_i) || + contains(CollectionSchema.date_in_content_min_dt) || contains(CollectionSchema.date_in_content_max_dt)) { + LinkedHashSet dates_in_content = condenser.dates_in_content; + if (allAttr || contains(CollectionSchema.dates_in_content_count_i)) { + add(doc, CollectionSchema.dates_in_content_count_i, dates_in_content.size()); + } + if (dates_in_content.size() > 0) { + if (allAttr || contains(CollectionSchema.dates_in_content_sxt)) { + String[] dates = new String[dates_in_content.size()]; + int i = 0; for (Date d: dates_in_content) dates[i++] = org.apache.solr.schema.TrieDateField.formatExternal(d); + add(doc, CollectionSchema.dates_in_content_sxt, dates); + } + // order the dates to get the oldest and youngest + TreeSet ordered_dates = new TreeSet<>(); + ordered_dates.addAll(dates_in_content); + if (allAttr || contains(CollectionSchema.date_in_content_min_dt)) { + Date date_in_content_min_dt = ordered_dates.iterator().next(); + add(doc, CollectionSchema.date_in_content_min_dt, date_in_content_min_dt); + } + if (allAttr || contains(CollectionSchema.date_in_content_max_dt)) { + Date date_in_content_max_dt = ordered_dates.descendingIterator().next(); + add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt); + } + } + + } if (allAttr || contains(CollectionSchema.keywords)) { String keywords = document.dc_subject(' '); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index aecd14bab..8c68799ef 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -35,6 +35,10 @@ public enum CollectionSchema implements SchemaDeclaration { sku(SolrType.string, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. //sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), + dates_in_content_sxt(SolrType.string, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here in order of the appearances"), + dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"), + date_in_content_min_dt(SolrType.date, true, true, false, false, false, "if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates"), + date_in_content_max_dt(SolrType.date, true, true, false, false, false, "if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future"), content_type(SolrType.string, true, true, true, false, false, "mime-type of document"), http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"), www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),