More SentenceReader cleanup

pull/1/head
orbiter 13 years ago
parent 586bb0eb6a
commit fc0f9543fe

@ -367,8 +367,7 @@ dc_rights
} }
public List<StringBuilder> getSentences(final boolean pre) { public List<StringBuilder> getSentences(final boolean pre) {
final SentenceReader sr = new SentenceReader(getTextString()); final SentenceReader sr = new SentenceReader(getTextString(), pre);
sr.pre(pre);
List<StringBuilder> sentences = new ArrayList<StringBuilder>(); List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) { while (sr.hasNext()) {
sentences.add(sr.next()); sentences.add(sr.next());

@ -33,16 +33,19 @@ public class SentenceReader implements Iterator<StringBuilder> {
private StringBuilder buffer; private StringBuilder buffer;
private String text; private String text;
private int pos; private int pos;
private int counter = 0;
private boolean pre = false; private boolean pre = false;
public SentenceReader(final String text) { public SentenceReader(final String text) {
assert text != null; assert text != null;
this.text = text; this.text = text;
this.pos = 0; this.pos = 0;
this.buffer = nextElement0();
this.counter = 0;
this.pre = false; this.pre = false;
this.buffer = nextElement0();
}
public SentenceReader(final String text, final boolean pre) {
this(text);
this.pre = pre;
} }
public void pre(final boolean x) { public void pre(final boolean x) {
@ -71,9 +74,9 @@ public class SentenceReader implements Iterator<StringBuilder> {
break; break;
} }
c = (char) nextChar; c = (char) nextChar;
if (pre && ((c == (char) 10) || (c == (char) 13))) break; if (pre && (nextChar == 10 || nextChar == 13)) break;
if (c < ' ') c = ' '; if (c < ' ') c = ' ';
if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces if (lc == ' ' && c == ' ') continue; // ignore double spaces
s.append(c); s.append(c);
if (punctuation(lc) && invisible(c)) break; if (punctuation(lc) && invisible(c)) break;
lc = c; lc = c;
@ -88,7 +91,10 @@ public class SentenceReader implements Iterator<StringBuilder> {
} }
public final static boolean invisible(final char c) { public final static boolean invisible(final char c) {
final int type = Character.getType(c); // first check average simple case
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false;
// then check more complex case which applies to all character sets
final int type = Character.getType(c);
return !(type == Character.LOWERCASE_LETTER return !(type == Character.LOWERCASE_LETTER
|| type == Character.DECIMAL_DIGIT_NUMBER || type == Character.DECIMAL_DIGIT_NUMBER
|| type == Character.UPPERCASE_LETTER || type == Character.UPPERCASE_LETTER
@ -110,16 +116,11 @@ public class SentenceReader implements Iterator<StringBuilder> {
if (buffer == null) { if (buffer == null) {
return null; return null;
} }
counter = counter + buffer.length() + 1;
final StringBuilder r = buffer; final StringBuilder r = buffer;
buffer = nextElement0(); buffer = nextElement0();
return r; return r;
} }
public int count() {
return counter;
}
public void remove() { public void remove() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }

@ -182,8 +182,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// try the solr text first // try the solr text first
if (solrText != null) { if (solrText != null) {
// compute sentences from solr query // compute sentences from solr query
final SentenceReader sr = new SentenceReader(solrText); final SentenceReader sr = new SentenceReader(solrText, pre);
sr.pre(pre);
sentences = new ArrayList<StringBuilder>(); sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) { while (sr.hasNext()) {
sentences.add(sr.next()); sentences.add(sr.next());

Loading…
Cancel
Save