From 5db97a892856265737f67978a5175c36c46e5cee Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 2 Sep 2023 19:15:22 +0200 Subject: [PATCH] parser can now separate numbers from words also when they are not separated by a space, e.g. 4.7Ohm --- source/net/yacy/document/SentenceReader.java | 2 +- source/net/yacy/document/WordTokenizer.java | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java index e7012c68d..3806fc709 100644 --- a/source/net/yacy/document/SentenceReader.java +++ b/source/net/yacy/document/SentenceReader.java @@ -178,7 +178,7 @@ public class SentenceReader implements Iterator, Iterable { continue; // Continue to the next character without further checks. } + // Transition from digit (or digit separator) to a letter. Save the number as a token and start a new token for the word. + if (wasDigitSep && Character.isLetter(c)) { + if (sb.length() > 0) { + this.s.add(sb); + sb = new StringBuilder(20); + } + wasDigitSep = false; + } + // Check if the current character is a punctuation. // Punctuation checks are prioritized over invisibles due to simplicity and speed. if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible @@ -189,6 +198,11 @@ public class WordTokenizer implements Enumeration { // If the character is not punctuation or invisible, add it to the current token. else { sb = sb.append(c); + // Check for transition from number to word, e.g., "4.7Ohm" + if (i < r.length() - 1 && Character.isDigit(c) && Character.isLetter(r.charAt(i + 1))) { + this.s.add(sb); + sb = new StringBuilder(20); // Start capturing the word as a new token. + } } }