Hack to make date detection faster (although it becomes a bit incomplete regarding language alternatives)
pull/1/head
Michael Peter Christen 10 years ago
parent 6578ff3ddb
commit ee97302a23

@ -36,6 +36,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.util.ConcurrentLog;
/** /**
* The purpose of this class exceeds the demands on simple date parsing using a SimpleDateFormat * The purpose of this class exceeds the demands on simple date parsing using a SimpleDateFormat
@ -450,6 +451,7 @@ public class DateDetection {
public LinkedHashSet<Date> parse(final String text) { public LinkedHashSet<Date> parse(final String text) {
LinkedHashSet<Date> dates = new LinkedHashSet<>(); LinkedHashSet<Date> dates = new LinkedHashSet<>();
Matcher matcher = this.pattern.matcher(text); Matcher matcher = this.pattern.matcher(text);
//ConcurrentLog.info("DateDetection", "applying matcher: " + matcher.toString());
while (matcher.find()) { while (matcher.find()) {
if (!(matcher.groupCount() == 2)) continue; if (!(matcher.groupCount() == 2)) continue;
String entity1 = matcher.group(1); if (entity1 == null) continue; String entity1 = matcher.group(1); if (entity1 == null) continue;
@ -513,10 +515,12 @@ public class DateDetection {
private static LinkedHashSet<Date> parseRawDate(String text) { private static LinkedHashSet<Date> parseRawDate(String text) {
// get parse alternatives for different date styles; we consider that one document uses only one style // get parse alternatives for different date styles; we consider that one document uses only one style
LinkedHashSet<Date> DMYDates = EndianStyle.DMY.parse(text); LinkedHashSet<Date> DMYDates = EndianStyle.DMY.parse(text);
LinkedHashSet<Date> DMDates = ShortStyle.DM_GERMAN.parse(text); ShortStyle[] shortStyleCheck = new ShortStyle[]{ShortStyle.DM_GERMAN, ShortStyle.DM_FRENCH, ShortStyle.DM_ITALIAN, ShortStyle.DM_SPANISH};
DMDates.addAll(ShortStyle.DM_FRENCH.parse(text)); LinkedHashSet<Date> DMDates = new LinkedHashSet<>();
DMDates.addAll(ShortStyle.DM_ITALIAN.parse(text)); for (ShortStyle shortStyle: shortStyleCheck) {
DMDates.addAll(ShortStyle.DM_SPANISH.parse(text)); DMDates.addAll(shortStyle.parse(text));
if (DMDates.size() > 0) break;
}
DMYDates.addAll(DMDates); DMYDates.addAll(DMDates);
LinkedHashSet<Date> MDYDates = DMYDates.size() == 0 ? EndianStyle.MDY.parse(text) : new LinkedHashSet<Date>(0); LinkedHashSet<Date> MDYDates = DMYDates.size() == 0 ? EndianStyle.MDY.parse(text) : new LinkedHashSet<Date>(0);

Loading…
Cancel
Save