You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
yacy_search_server/source/net/yacy/document/SentenceReader.java

205 lines
6.2 KiB

/**
* SentenceReader
* Copyright 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 09.02.2011 on http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Read sentences from a given text.
 * This enumerates StringBuilder objects.
 * <p>
 * A sentence ends when a punctuation character ('.', '!' or '?') is followed
 * by an "invisible" character (see {@link #invisible(char)}), or - in
 * <code>pre</code> mode - at every line break. Control characters are replaced
 * by spaces and runs of spaces are collapsed to a single one.
 * <p>
 * The instance is its own iterator: {@link #iterator()} returns <code>this</code>,
 * so a second iteration only starts from the beginning after {@link #reset()}.
 * Every sentence that was parsed is kept in the sentences list and replayed
 * after a reset instead of being parsed again.
 */
public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {

    /** Holds the next element, null when there is nothing more to deliver */
    private StringBuilder buffer;

    /** List of already parsed sentences, eventually in addition to those extracted from the main text. */
    private List<StringBuilder> parsedSentences;

    /** Current position in the parsedSentences list. */
    private int sentencesPos;

    /** The main text to parse for sentences, null once the reader is closed */
    private String text;

    /** The current character position in the main text */
    private int pos;

    /** When true sentences cannot include line break characters */
    private boolean pre = false;

    /**
     * Create a reader on the given text, with <code>pre</code> mode disabled.
     * @param text the text to split into sentences, must not be null
     */
    public SentenceReader(final String text) {
        this(new ArrayList<>(), text, false);
    }

    /**
     * Create a reader on the given text.
     * @param text the text to split into sentences, must not be null
     * @param pre when true, line breaks (LF or CR) always end a sentence
     */
    public SentenceReader(final String text, final boolean pre) {
        this(new ArrayList<>(), text, pre);
    }

    /**
     * Create a reader which first delivers the given sentences and then the
     * sentences parsed from the text. Sentences parsed from the text are
     * appended to the given list.
     * @param parsedSentences already available sentences; a new empty list is used when null
     * @param text the text to split into sentences, must not be null
     * @param pre when true, line breaks (LF or CR) always end a sentence
     */
    public SentenceReader(final List<StringBuilder> parsedSentences, final String text, final boolean pre) {
        assert text != null;
        this.text = text;
        this.pos = 0;
        this.pre = pre;
        if (parsedSentences == null) {
            this.parsedSentences = new ArrayList<>();
        } else {
            this.parsedSentences = parsedSentences;
        }
        this.sentencesPos = 0;
        // pre-load the first element so that hasNext() is a simple null check
        this.buffer = nextElement0();
    }

    /**
     * Switch the <code>pre</code> mode. The element already held in the look-ahead
     * buffer is not re-parsed; the new mode applies to text parsed afterwards.
     * @param x when true, line breaks (LF or CR) always end a sentence
     */
    public void pre(final boolean x) {
        this.pre = x;
    }

    /**
     * Compute the next element: first the not yet delivered entries of the
     * sentences list, then sentences freshly parsed from the main text.
     * Empty segments (blank lines in pre mode, the second half of a CR/LF pair,
     * a NUL character) are skipped, they do not end the iteration.
     * @return the next sentence, or null when the text is exhausted or the reader is closed
     */
    private StringBuilder nextElement0() {
        // after close() there is nothing left to read from
        if (this.text == null || this.parsedSentences == null) {
            return null;
        }

        // replay already known sentences first
        if (this.sentencesPos < this.parsedSentences.size()) {
            final StringBuilder element = this.parsedSentences.get(this.sentencesPos);
            this.sentencesPos++;
            return element;
        }

        final int length = this.text.length();
        // readSegment() consumes at least one character per call, so this loop terminates
        while (this.pos < length) {
            final StringBuilder s = readSegment(length);
            if (s.length() == 0) {
                continue; // nothing visible in this segment, try the next one
            }
            /* Add to parsed sentences list for eventual reuse after a reset */
            this.parsedSentences.add(s);
            this.sentencesPos++;
            return s;
        }
        return null;
    }

    /**
     * Read characters from the current position up to the next sentence end
     * and advance the position behind the consumed characters.
     * @param length the length of the main text
     * @return the normalized segment without trailing space, possibly empty
     */
    private StringBuilder readSegment(final int length) {
        final StringBuilder s = new StringBuilder(80);
        char lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
        while (this.pos < length) {
            char c = this.text.charAt(this.pos++);
            if (c == 0) break; // a NUL character ends the segment
            if (this.pre && (c == '\n' || c == '\r')) break; // line breaks end the segment in pre mode
            if (c < ' ') c = ' '; // control characters count as space
            if (lc == ' ' && c == ' ') continue; // ignore double spaces
            s.append(c);
            if (punctuation(lc) && invisible(c)) break; // sentence end found
            lc = c;
        }
        final int last = s.length() - 1;
        if (last >= 0 && s.charAt(last) == ' ') {
            // remove the trailing space first, then release the unused capacity
            s.deleteCharAt(last);
            s.trimToSize();
        }
        return s;
    }

    /**
     * Check whether a character is neither a letter, a digit, a punctuation
     * nor a digit separator character.
     * @param c the character to check
     * @return true when the character is not part of a word, number or sentence mark
     */
    public final static boolean invisible(final char c) {
        // first check average simple case
        if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false;
        // then check more complex case which applies to all character sets
        final int type = Character.getType(c);
        return !(type == Character.LOWERCASE_LETTER
            || type == Character.DECIMAL_DIGIT_NUMBER
            || type == Character.UPPERCASE_LETTER
            || type == Character.MODIFIER_LETTER
            || type == Character.OTHER_LETTER
            || type == Character.TITLECASE_LETTER
            || punctuation(c) || digitsep(c));
    }

    /**
     * @param c the character to check
     * @return true when the character is one of '.', '!' or '?'
     */
    public final static boolean punctuation(final char c) {
        switch (c) {
        case '.':
        case '!':
        case '?':
            return true;
        default:
            return false;
        }
    }

    /**
     * @param c the character to check
     * @return true when the character is one of '.' or ','
     */
    public final static boolean digitsep(final char c) {
        switch (c) {
        case '.':
        case ',':
            return true;
        default:
            return false;
        }
    }

    /**
     * @return true when a further sentence is available
     */
    @Override
    public boolean hasNext() {
        return this.buffer != null;
    }

    /**
     * Deliver the next sentence and pre-load the following one.
     * Note: unlike the general Iterator contract, this returns null instead of
     * throwing an exception when no more element is available.
     * @return the next sentence, or null when there is none
     */
    @Override
    public StringBuilder next() {
        if (this.buffer == null) {
            return null;
        }
        final StringBuilder r = this.buffer;
        this.buffer = nextElement0();
        return r;
    }

    /**
     * Not supported.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * @return this instance; call {@link #reset()} to iterate again from the start
     */
    @Override
    public Iterator<StringBuilder> iterator() {
        return this;
    }

    /**
     * Reset the iterator position to zero
     */
    public void reset() {
        /* Reset only the sentences position to reuse already parsed sentences */
        this.sentencesPos = 0;
        this.buffer = nextElement0();
    }

    /**
     * Release the references to the text and to the sentences list.
     * After this call {@link #hasNext()} returns false and {@link #next()} returns null.
     */
    public synchronized void close() {
        this.text = null;
        this.parsedSentences = null;
        // drop the look-ahead element too, otherwise hasNext() would still report true
        this.buffer = null;
    }

    /**
     * Small demonstration: print the sentences, then the words, of a sample text.
     * @param args not used
     */
    public static void main(String[] args) {
        String s = "a b 1.5 ccc -4,7 d. so -o et, qu. 4.7Ohm 2.54inch.";
        SentenceReader sr = new SentenceReader(s);
        for (StringBuilder a: sr) System.out.println(a);

        sr = new SentenceReader(s);
        WordTokenizer words = new WordTokenizer(sr, null);
        try {
            while (words.hasMoreElements()) {
                System.out.println(words.nextElement().toString());
            }
        } finally {
            words.close();
        }
    }
}