parser can now separate numbers from words even when they are not

separated by a space, e.g. 4.7Ohm
pull/594/head
Michael Peter Christen 1 year ago
parent 079eafe7f1
commit 5db97a8928

@ -178,7 +178,7 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
}
public static void main(String[] args) {
String s = "a b 1.5 ccc 4,7 d. so o et, qu.";
String s = "a b 1.5 ccc 4,7 d. so o et, qu. 4.7Ohm 2.54inch.";
SentenceReader sr = new SentenceReader(s);
for (StringBuilder a: sr) System.out.println(a);
sr = new SentenceReader(s);

@ -162,6 +162,15 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
continue; // Continue to the next character without further checks.
}
// Transition from digit (or digit separator) to a letter. Save the number as a token and start a new token for the word.
if (wasDigitSep && Character.isLetter(c)) {
if (sb.length() > 0) {
this.s.add(sb);
sb = new StringBuilder(20);
}
wasDigitSep = false;
}
// Check if the current character is a punctuation.
// Punctuation checks are prioritized over invisibles due to simplicity and speed.
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
@ -189,6 +198,11 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
// If the character is not punctuation or invisible, add it to the current token.
else {
sb = sb.append(c);
// Check for transition from number to word, e.g., "4.7Ohm"
if (i < r.length() - 1 && Character.isDigit(c) && Character.isLetter(r.charAt(i + 1))) {
this.s.add(sb);
sb = new StringBuilder(20); // Start capturing the word as a new token.
}
}
}

Loading…
Cancel
Save