From 0689f4f0ae2907c35c080ea15b5ad406ea9402e7 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 3 Sep 2023 10:22:03 +0200 Subject: [PATCH] Check if the character is a minus sign and is followed by a letter or a digit. Treat it as part of the word/number. --- source/net/yacy/document/SentenceReader.java | 26 +++++++++++++------- source/net/yacy/document/WordTokenizer.java | 6 +++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java index 3806fc709..59b08d02f 100644 --- a/source/net/yacy/document/SentenceReader.java +++ b/source/net/yacy/document/SentenceReader.java @@ -27,11 +27,6 @@ package net.yacy.document; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import java.util.SortedMap; -import java.util.TreeMap; - -import net.yacy.cora.order.Base64Order; -import net.yacy.kelondro.data.word.Word; /** * Read sentences from a given text. @@ -127,15 +122,28 @@ public class SentenceReader implements Iterator, Iterable, Iterable { for (int i = 0; i < r.length(); i++) { // tokenize one sentence c = r.charAt(i); + // Check if the character is a minus sign and is followed by a letter or a digit. Treat it as part of the word/number. + if (c == '-' && i < r.length() - 1 && (Character.isLetter(r.charAt(i + 1)) || Character.isDigit(r.charAt(i + 1)))) { + sb.append(c); + continue; // Skip further checks and continue to the next character. + } + // Check if the current character is a digit separator within a number. if (SentenceReader.digitsep(c) && i > 0 && Character.isDigit(r.charAt(i - 1)) && (i < r.length() - 1) && Character.isDigit(r.charAt(i + 1))) { sb.append(c); // Add the digit separator to the current token.