|
|
@ -26,7 +26,7 @@ package net.yacy.document;
|
|
|
|
|
|
|
|
|
|
|
|
import java.util.Iterator;
|
|
|
|
import java.util.Iterator;
|
|
|
|
|
|
|
|
|
|
|
|
public class SentenceReader implements Iterator<StringBuilder> {
|
|
|
|
public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
|
|
|
|
// read sentences from a given input stream
|
|
|
|
// read sentences from a given input stream
|
|
|
|
// this enumerates StringBuilder objects
|
|
|
|
// this enumerates StringBuilder objects
|
|
|
|
|
|
|
|
|
|
|
@ -66,15 +66,14 @@ public class SentenceReader implements Iterator<StringBuilder> {
|
|
|
|
|
|
|
|
|
|
|
|
// find sentence end
|
|
|
|
// find sentence end
|
|
|
|
while (true) {
|
|
|
|
while (true) {
|
|
|
|
if (pos >= text.length()) return null;
|
|
|
|
if (this.pos >= this.text.length()) break;
|
|
|
|
nextChar = text.charAt(pos++);
|
|
|
|
nextChar = this.text.charAt(this.pos++);
|
|
|
|
//System.out.print((char) nextChar); // DEBUG
|
|
|
|
//System.out.print((char) nextChar); // DEBUG
|
|
|
|
if (nextChar < 0) {
|
|
|
|
if (nextChar < 0) {
|
|
|
|
if (s.length() == 0) return null;
|
|
|
|
|
|
|
|
break;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
c = (char) nextChar;
|
|
|
|
c = (char) nextChar;
|
|
|
|
if (pre && (nextChar == 10 || nextChar == 13)) break;
|
|
|
|
if (this.pre && (nextChar == 10 || nextChar == 13)) break;
|
|
|
|
if (c < ' ') c = ' ';
|
|
|
|
if (c < ' ') c = ' ';
|
|
|
|
if (lc == ' ' && c == ' ') continue; // ignore double spaces
|
|
|
|
if (lc == ' ' && c == ' ') continue; // ignore double spaces
|
|
|
|
s.append(c);
|
|
|
|
s.append(c);
|
|
|
@ -82,7 +81,7 @@ public class SentenceReader implements Iterator<StringBuilder> {
|
|
|
|
lc = c;
|
|
|
|
lc = c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (s.length() == 0) return s;
|
|
|
|
if (s.length() == 0) return null;
|
|
|
|
if (s.charAt(s.length() - 1) == ' ') {
|
|
|
|
if (s.charAt(s.length() - 1) == ' ') {
|
|
|
|
s.trimToSize();
|
|
|
|
s.trimToSize();
|
|
|
|
s.deleteCharAt(s.length() - 1);
|
|
|
|
s.deleteCharAt(s.length() - 1);
|
|
|
@ -108,24 +107,40 @@ public class SentenceReader implements Iterator<StringBuilder> {
|
|
|
|
return c == '.' || c == '!' || c == '?';
|
|
|
|
return c == '.' || c == '!' || c == '?';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public boolean hasNext() {
|
|
|
|
public boolean hasNext() {
|
|
|
|
return buffer != null;
|
|
|
|
return this.buffer != null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public StringBuilder next() {
|
|
|
|
public StringBuilder next() {
|
|
|
|
if (buffer == null) {
|
|
|
|
if (this.buffer == null) {
|
|
|
|
return null;
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
final StringBuilder r = buffer;
|
|
|
|
final StringBuilder r = this.buffer;
|
|
|
|
buffer = nextElement0();
|
|
|
|
this.buffer = nextElement0();
|
|
|
|
return r;
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void remove() {
|
|
|
|
public void remove() {
|
|
|
|
throw new UnsupportedOperationException();
|
|
|
|
throw new UnsupportedOperationException();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
|
|
|
public Iterator<StringBuilder> iterator() {
|
|
|
|
|
|
|
|
return this;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public synchronized void close() {
|
|
|
|
public synchronized void close() {
|
|
|
|
text = null;
|
|
|
|
this.text = null;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
|
|
|
|
String s = "a b ccc d";
|
|
|
|
|
|
|
|
SentenceReader sr = new SentenceReader(s);
|
|
|
|
|
|
|
|
for (StringBuilder a: sr) {
|
|
|
|
|
|
|
|
System.out.println(a);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|