Fix for the URL camel-case parser and the sentence reader

pull/1/head
Michael Peter Christen 13 years ago
parent fbc1a2030d
commit 801972fe6f

@ -127,6 +127,8 @@ public final class Condenser {
this.languageIdentificator = new Identificator(); this.languageIdentificator = new Identificator();
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
Map.Entry<MultiProtocolURI, String> entry; Map.Entry<MultiProtocolURI, String> entry;
if (indexText) { if (indexText) {
@ -175,9 +177,6 @@ public final class Condenser {
this.RESULT_DIFF_SENTENCES = 0; this.RESULT_DIFF_SENTENCES = 0;
} }
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
if (indexMedia) { if (indexMedia) {
// add anchor descriptions: here, we also add the url components // add anchor descriptions: here, we also add the url components
// audio // audio
@ -311,7 +310,7 @@ public final class Condenser {
boolean comb_indexof = false, last_last = false, last_index = false; boolean comb_indexof = false, last_last = false, last_index = false;
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100); final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
if (LibraryProvider.autotagging.size() == 0) doAutotagging = false; if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
// read source // read source
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib); final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try { try {

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either * License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version. * version 2.1 of the License, or (at your option) any later version.
* *
* This library is distributed in the hope that it will be useful, * This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. * Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt * along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
@ -26,10 +26,10 @@ package net.yacy.document;
import java.util.Iterator; import java.util.Iterator;
public class SentenceReader implements Iterator<StringBuilder> { public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
// read sentences from a given input stream // read sentences from a given input stream
// this enumerates StringBuilder objects // this enumerates StringBuilder objects
private StringBuilder buffer; private StringBuilder buffer;
private String text; private String text;
private int pos; private int pos;
@ -42,7 +42,7 @@ public class SentenceReader implements Iterator<StringBuilder> {
this.pre = false; this.pre = false;
this.buffer = nextElement0(); this.buffer = nextElement0();
} }
public SentenceReader(final String text, final boolean pre) { public SentenceReader(final String text, final boolean pre) {
this(text); this(text);
this.pre = pre; this.pre = pre;
@ -51,38 +51,37 @@ public class SentenceReader implements Iterator<StringBuilder> {
public void pre(final boolean x) { public void pre(final boolean x) {
this.pre = x; this.pre = x;
} }
private StringBuilder nextElement0() { private StringBuilder nextElement0() {
final StringBuilder s = readSentence(); final StringBuilder s = readSentence();
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG //System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) return null; if (s == null) return null;
return s; return s;
} }
private StringBuilder readSentence() { private StringBuilder readSentence() {
final StringBuilder s = new StringBuilder(80); final StringBuilder s = new StringBuilder(80);
int nextChar; int nextChar;
char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' ' char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
// find sentence end // find sentence end
while (true) { while (true) {
if (pos >= text.length()) return null; if (this.pos >= this.text.length()) break;
nextChar = text.charAt(pos++); nextChar = this.text.charAt(this.pos++);
//System.out.print((char) nextChar); // DEBUG //System.out.print((char) nextChar); // DEBUG
if (nextChar < 0) { if (nextChar < 0) {
if (s.length() == 0) return null;
break; break;
} }
c = (char) nextChar; c = (char) nextChar;
if (pre && (nextChar == 10 || nextChar == 13)) break; if (this.pre && (nextChar == 10 || nextChar == 13)) break;
if (c < ' ') c = ' '; if (c < ' ') c = ' ';
if (lc == ' ' && c == ' ') continue; // ignore double spaces if (lc == ' ' && c == ' ') continue; // ignore double spaces
s.append(c); s.append(c);
if (punctuation(lc) && invisible(c)) break; if (punctuation(lc) && invisible(c)) break;
lc = c; lc = c;
} }
if (s.length() == 0) return s; if (s.length() == 0) return null;
if (s.charAt(s.length() - 1) == ' ') { if (s.charAt(s.length() - 1) == ' ') {
s.trimToSize(); s.trimToSize();
s.deleteCharAt(s.length() - 1); s.deleteCharAt(s.length() - 1);
@ -103,29 +102,45 @@ public class SentenceReader implements Iterator<StringBuilder> {
|| type == Character.TITLECASE_LETTER || type == Character.TITLECASE_LETTER
|| punctuation(c)); || punctuation(c));
} }
public final static boolean punctuation(final char c) { public final static boolean punctuation(final char c) {
return c == '.' || c == '!' || c == '?'; return c == '.' || c == '!' || c == '?';
} }
@Override
public boolean hasNext() { public boolean hasNext() {
return buffer != null; return this.buffer != null;
} }
@Override
public StringBuilder next() { public StringBuilder next() {
if (buffer == null) { if (this.buffer == null) {
return null; return null;
} }
final StringBuilder r = buffer; final StringBuilder r = this.buffer;
buffer = nextElement0(); this.buffer = nextElement0();
return r; return r;
} }
@Override
public void remove() { public void remove() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public Iterator<StringBuilder> iterator() {
return this;
}
public synchronized void close() { public synchronized void close() {
text = null; this.text = null;
}
public static void main(String[] args) {
String s = "a b ccc d";
SentenceReader sr = new SentenceReader(s);
for (StringBuilder a: sr) {
System.out.println(a);
}
} }
} }

Loading…
Cancel
Save