Added and integrated new date detection class which can identify date

notions within the fulltext of a document. This class also attempts to
identify dates that are abbreviated, that lack a year, or that are
described by the name of a special day, like 'Halloween'. If a date has
no year given, the current year and the following years are considered.

This process can therefore assign a large set of dates to a
document, either because several dates are given in the document
or because a date is ambiguous. Four new Solr fields are used to store
the parsing result:

dates_in_content_sxt:
if date expressions can be found in the content, these dates are listed
here in order of appearance

dates_in_content_count_i:
the number of entries in dates_in_content_sxt

date_in_content_min_dt:
if dates_in_content_sxt is filled, this contains the oldest date from
the list of available dates

date_in_content_max_dt:
if dates_in_content_sxt is filled, this contains the youngest date from
the list of available dates, which may also lie in the future

These fields are deactivated by default because evaluating the
regular expressions used to detect dates is still too CPU-intensive.
Future enhancements may allow them to be switched on by default.

The purpose of these fields is the creation of calendar-like search
facets, to be implemented next.
pull/1/head
Michael Peter Christen 10 years ago
parent c3c2b6999b
commit 66b5a56976

@ -18,16 +18,16 @@ sku
## last-modified from http header, date (mandatory field)
last_modified
## if date expressions can be found in the content, these dates are listed here in order of the appearances"),
## if date expressions can be found in the content, these dates are listed here in order of the appearances
#dates_in_content_sxt
## the number of entries in dates_in_content_sxt
#dates_in_content_count_i
## if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates"),
## if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates
#date_in_content_min_dt
## if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future"),
## if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future
#date_in_content_max_dt
## mime-type of document, string (mandatory field)

@ -88,7 +88,7 @@ public class GenericFormatter extends AbstractFormatter implements DateFormatter
* Note: The short day format doesn't include any timezone information. This method
* transforms the date into the GMT/UTC timezone. Example: If the local system time is,
* 2007-12-18 01:15:00 +0200, then the resulting String will be "2007-12-17".
* In case you need a format with a timezon offset, use {@link #formatShortDay(TimeZone)}
* In case you need a format with a timezone offset, use {@link #formatShortDay(TimeZone)}
* @return a String representation of the current system date in GMT using the
* short day format, e.g. "20071218".
*/

@ -79,7 +79,7 @@ public class Transactions {
archive = new Snapshots(archiveDir);
}
public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent) {
public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// GET METADATA FROM DOC
final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@ -132,7 +132,7 @@ public class Transactions {
public void run() {
executorRunning.incrementAndGet();
try {
Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);
Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
} catch (Throwable e) {} finally {
executorRunning.decrementAndGet();
}
@ -140,7 +140,7 @@ public class Transactions {
};
executor.execute(t);
} else {
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
}
}

@ -86,7 +86,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words();
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false, false).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;

@ -25,6 +25,7 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -85,13 +86,15 @@ public final class Condenser {
//public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4);
private final Identificator languageIdentificator;
public LinkedHashSet<Date> dates_in_content;
public Condenser(
final Document document,
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
final boolean doAutotagging
final boolean doAutotagging,
final boolean findDatesInContent
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
@ -99,6 +102,7 @@ public final class Condenser {
this.words = new HashMap<String, Word>();
this.synonyms = new LinkedHashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
this.dates_in_content = new LinkedHashSet<Date>();
// construct flag set for document
ContentDomain contentDomain = document.getContentDomain();
@ -115,7 +119,9 @@ public final class Condenser {
Map.Entry<AnchorURL, String> entry;
if (indexText) {
createCondensement(document.getTextString(), meaningLib, doAutotagging);
String text = document.getTextString();
if (findDatesInContent) this.dates_in_content = DateDetection.parse(text);
createCondensement(text, meaningLib, doAutotagging);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle

@ -136,9 +136,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException {
if (tag == null) return;
tag = tag.toLowerCase();
if ("record".equals(tag) || "document".equals(tag)) {
if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) {
this.surrogate = new DCEntry();
} else if ("element".equals(tag)) {
} else if ("element".equals(tag) || "str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)) {
this.elementName = atts.getValue("name");
} else if ("value".equals(tag)) {
this.buffer.setLength(0);
@ -154,7 +154,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
public void endElement(final String uri, final String name, String tag) {
if (tag == null) return;
tag = tag.toLowerCase();
if ("record".equals(tag) || "document".equals(tag)) {
if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) {
//System.out.println("A Title: " + this.surrogate.title());
try {
this.surrogates.put(this.surrogate);
@ -169,6 +169,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
} else if ("element".equals(tag)) {
this.buffer.setLength(0);
this.parsingValue = false;
} else if ("str".equals(tag) || "int".equals(tag) || "bool".equals(tag) || "long".equals(tag)){
final String value = buffer.toString().trim();
if (this.elementName != null) {
this.surrogate.getMap().put(this.elementName, new String[]{value});
}
this.parsingValue = false;
} else if ("value".equals(tag)) {
//System.out.println("BUFFER-SIZE=" + buffer.length());
final String value = buffer.toString().trim();

@ -120,7 +120,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false);
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false, false);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (final IOException e) {

@ -2751,7 +2751,7 @@ public final class Switchboard extends serverSwitch {
new Condenser(
in.documents[i], in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, true);
LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@ -2853,7 +2853,8 @@ public final class Switchboard extends serverSwitch {
searchEvent,
sourceName,
getConfigBool(SwitchboardConstants.DHT_ENABLED, false),
sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
this.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null,
this.getConfig("crawler.http.acceptLanguage", null));
final RSSFeed feed =
EventChannel.channels(queueEntry.initiator() == null
? EventChannel.PROXY
@ -3186,7 +3187,9 @@ public final class Switchboard extends serverSwitch {
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
final Condenser condenser = new Condenser(
document, true, true, LibraryProvider.dymLib, true,
Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex(

@ -158,7 +158,7 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true, true);
rows[c++] =
super.storeDocument(
url,
@ -171,6 +171,7 @@ public class DocumentIndex extends Segment {
null,
DocumentIndex.class.getName() + ".add",
false,
null,
null);
}
return rows;

@ -761,7 +761,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
words = new Condenser(document, true, true, null, false).words().keySet();
words = new Condenser(document, true, true, null, false, false).words().keySet();
// delete all word references
int count = 0;

@ -42,6 +42,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
@ -494,6 +495,34 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr ||
contains(CollectionSchema.dates_in_content_sxt) || contains(CollectionSchema.dates_in_content_count_i) ||
contains(CollectionSchema.date_in_content_min_dt) || contains(CollectionSchema.date_in_content_max_dt)) {
LinkedHashSet<Date> dates_in_content = condenser.dates_in_content;
if (allAttr || contains(CollectionSchema.dates_in_content_count_i)) {
add(doc, CollectionSchema.dates_in_content_count_i, dates_in_content.size());
}
if (dates_in_content.size() > 0) {
if (allAttr || contains(CollectionSchema.dates_in_content_sxt)) {
String[] dates = new String[dates_in_content.size()];
int i = 0; for (Date d: dates_in_content) dates[i++] = org.apache.solr.schema.TrieDateField.formatExternal(d);
add(doc, CollectionSchema.dates_in_content_sxt, dates);
}
// order the dates to get the oldest and youngest
TreeSet<Date> ordered_dates = new TreeSet<>();
ordered_dates.addAll(dates_in_content);
if (allAttr || contains(CollectionSchema.date_in_content_min_dt)) {
Date date_in_content_min_dt = ordered_dates.iterator().next();
add(doc, CollectionSchema.date_in_content_min_dt, date_in_content_min_dt);
}
if (allAttr || contains(CollectionSchema.date_in_content_max_dt)) {
Date date_in_content_max_dt = ordered_dates.descendingIterator().next();
add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt);
}
}
}
if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' ');

@ -35,6 +35,10 @@ public enum CollectionSchema implements SchemaDeclaration {
sku(SolrType.string, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
//sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
dates_in_content_sxt(SolrType.string, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here in order of the appearances"),
dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"),
date_in_content_min_dt(SolrType.date, true, true, false, false, false, "if dates_in_content_sxt is filled, this contains the oldest date from the list of available dates"),
date_in_content_max_dt(SolrType.date, true, true, false, false, false, "if dates_in_content_sxt is filled, this contains the youngest date from the list of available dates, that may also be possibly in the future"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"),
www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),

Loading…
Cancel
Save