Parser can now separate numbers from words even when they are not

separated by a space, e.g. 4.7Ohm
pull/594/head
Michael Peter Christen 1 year ago
parent 079eafe7f1
commit 5db97a8928

@ -178,7 +178,7 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
} }
public static void main(String[] args) { public static void main(String[] args) {
String s = "a b 1.5 ccc 4,7 d. so o et, qu."; String s = "a b 1.5 ccc 4,7 d. so o et, qu. 4.7Ohm 2.54inch.";
SentenceReader sr = new SentenceReader(s); SentenceReader sr = new SentenceReader(s);
for (StringBuilder a: sr) System.out.println(a); for (StringBuilder a: sr) System.out.println(a);
sr = new SentenceReader(s); sr = new SentenceReader(s);

@ -162,6 +162,15 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
continue; // Continue to the next character without further checks. continue; // Continue to the next character without further checks.
} }
// Transition from digit (or digit separator) to a letter. Save the number as a token and start a new token for the word.
if (wasDigitSep && Character.isLetter(c)) {
if (sb.length() > 0) {
this.s.add(sb);
sb = new StringBuilder(20);
}
wasDigitSep = false;
}
// Check if the current character is a punctuation. // Check if the current character is a punctuation.
// Punctuation checks are prioritized over invisibles due to simplicity and speed. // Punctuation checks are prioritized over invisibles due to simplicity and speed.
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
@ -189,6 +198,11 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
// If the character is not punctuation or invisible, add it to the current token. // If the character is not punctuation or invisible, add it to the current token.
else { else {
sb = sb.append(c); sb = sb.append(c);
// Check for transition from number to word, e.g., "4.7Ohm"
if (i < r.length() - 1 && Character.isDigit(c) && Character.isLetter(r.charAt(i + 1))) {
this.s.add(sb);
sb = new StringBuilder(20); // Start capturing the word as a new token.
}
} }
} }

Loading…
Cancel
Save