Parser can now separate numbers from words even when they are not

separated by a space, e.g. 4.7Ohm
2 years ago · 5db97a8928
parent 079eafe7f1
commit 5db97a8928
2 changed files with 15 additions and 1 deletions
--- a/source/net/yacy/document/SentenceReader.java
+++ b/source/net/yacy/document/SentenceReader.java
@ -178,7 +178,7 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
    }

    public static void main(String[] args) {
-        String s = "a b 1.5 ccc 4,7 d. so o et, qu.";
+        String s = "a b 1.5 ccc 4,7 d. so o et, qu. 4.7Ohm 2.54inch.";
        SentenceReader sr = new SentenceReader(s);
        for (StringBuilder a: sr) System.out.println(a);
        sr = new SentenceReader(s);
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@ -162,6 +162,15 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
                        continue; // Continue to the next character without further checks.
                    }

+                    // Transition from digit (or digit separator) to a letter. Save the number as a token and start a new token for the word.
+                    if (wasDigitSep && Character.isLetter(c)) {
+                        if (sb.length() > 0) {
+                            this.s.add(sb);
+                            sb = new StringBuilder(20);
+                        }
+                        wasDigitSep = false;
+                    }
+
                    // Check if the current character is a punctuation.
                    // Punctuation checks are prioritized over invisibles due to simplicity and speed.
                    if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
@ -189,6 +198,11 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
                    // If the character is not punctuation or invisible, add it to the current token.
                    else {
                        sb = sb.append(c);
+                        // Check for transition from number to word, e.g., "4.7Ohm"
+                        if (i < r.length() - 1 && Character.isDigit(c) && Character.isLetter(r.charAt(i + 1))) {
+                            this.s.add(sb);
+                            sb = new StringBuilder(20); // Start capturing the word as a new token.
+                        }
                    }
                }