Fix for URL camel-case parser and sentence reader

pull/1/head
Michael Peter Christen 13 years ago
parent fbc1a2030d
commit 801972fe6f

@ -127,6 +127,8 @@ public final class Condenser {
this.languageIdentificator = new Identificator();
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
@ -175,9 +177,6 @@ public final class Condenser {
this.RESULT_DIFF_SENTENCES = 0;
}
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio

@ -26,7 +26,7 @@ package net.yacy.document;
import java.util.Iterator;
public class SentenceReader implements Iterator<StringBuilder> {
public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
// read sentences from a given input stream
// this enumerates StringBuilder objects
@ -66,15 +66,14 @@ public class SentenceReader implements Iterator<StringBuilder> {
// find sentence end
while (true) {
if (pos >= text.length()) return null;
nextChar = text.charAt(pos++);
if (this.pos >= this.text.length()) break;
nextChar = this.text.charAt(this.pos++);
//System.out.print((char) nextChar); // DEBUG
if (nextChar < 0) {
if (s.length() == 0) return null;
break;
}
c = (char) nextChar;
if (pre && (nextChar == 10 || nextChar == 13)) break;
if (this.pre && (nextChar == 10 || nextChar == 13)) break;
if (c < ' ') c = ' ';
if (lc == ' ' && c == ' ') continue; // ignore double spaces
s.append(c);
@ -82,7 +81,7 @@ public class SentenceReader implements Iterator<StringBuilder> {
lc = c;
}
if (s.length() == 0) return s;
if (s.length() == 0) return null;
if (s.charAt(s.length() - 1) == ' ') {
s.trimToSize();
s.deleteCharAt(s.length() - 1);
@ -108,24 +107,40 @@ public class SentenceReader implements Iterator<StringBuilder> {
return c == '.' || c == '!' || c == '?';
}
@Override
public boolean hasNext() {
return buffer != null;
return this.buffer != null;
}
@Override
public StringBuilder next() {
if (buffer == null) {
if (this.buffer == null) {
return null;
}
final StringBuilder r = buffer;
buffer = nextElement0();
final StringBuilder r = this.buffer;
this.buffer = nextElement0();
return r;
}
/**
 * Removal is not supported by this reader.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public void remove() {
throw new UnsupportedOperationException();
}
/**
 * Returns this reader itself as the iterator, so the reader can be used
 * directly in a for-each loop. No new iterator object is created: every
 * call hands back the same instance and therefore the same iteration state.
 *
 * @return this instance
 */
@Override
public Iterator<StringBuilder> iterator() {
return this;
}
public synchronized void close() {
text = null;
this.text = null;
}
/**
 * Small manual demo: builds a reader over a fixed sample string and prints
 * every element the reader produces, one per line.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final SentenceReader reader = new SentenceReader("a b ccc d");
    // explicit iterator loop; equivalent to a for-each over the reader
    final Iterator<StringBuilder> elements = reader.iterator();
    while (elements.hasNext()) {
        System.out.println(elements.next());
    }
}
}

Loading…
Cancel
Save