Add CommonPattern.SPACES to match runs of one or more spaces

to eliminate the empty words that splitting on a single space produces when several spaces occur in a row
pull/9/head
reger 10 years ago
parent 90f75c8c3d
commit 821262a179

@ -449,7 +449,7 @@ public class ViewFile {
words = words.substring(1, words.length() - 1);
}
words = UTF8.decodeURL(words);
if (words.indexOf(' ',0) >= 0) return CommonPattern.SPACE.split(words);
if (words.indexOf(' ',0) >= 0) return CommonPattern.SPACES.split(words);
if (words.indexOf(',',0) >= 0) return CommonPattern.COMMA.split(words);
if (words.indexOf('+',0) >= 0) return words.split("\\+");
w = new String[1];

@ -164,7 +164,7 @@ public class Vocabulary_p {
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
if (t.isEmpty()) continue;
if (discoverFromTitleSplitted) {
String[] ts = CommonPattern.SPACE.split(t);
String[] ts = CommonPattern.SPACES.split(t);
for (String s: ts) {
if (s.isEmpty()) continue;
if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;

@ -70,7 +70,7 @@ public class yacysearch_location {
int placemarkCounter = 0;
if (query.length() > 0 && search_query) {
final Set<GeoLocation> locations = LibraryProvider.geoLoc.find(query, true);
for (final String qp: CommonPattern.SPACE.split(query)) {
for (final String qp: CommonPattern.SPACES.split(query)) {
locations.addAll(LibraryProvider.geoLoc.find(qp, true));
}
String ip = sb.peers.mySeed().getIP();

@ -34,7 +34,7 @@
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
<!-- the Solr version used in dependency section for all related dependencies -->
<solr.version>4.10.3</solr.version>
<solr.version>5.2.1</solr.version>
<!-- the Jetty version used in dependency section for all related dependencies -->
<jetty.version>9.2.11.v20150529</jetty.version>

@ -199,7 +199,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
final String subject = Token.subject.valueFrom(this.map, "");
if (subject.indexOf(',') >= 0) return CommonPattern.COMMA.split(subject);
if (subject.indexOf(';') >= 0) return CommonPattern.SEMICOLON.split(subject);
return CommonPattern.SPACE.split(subject);
return CommonPattern.SPACES.split(subject);
}
@Override

@ -985,7 +985,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
while ((p = t.indexOf(" ",0)) >= 0) t = t.substring(0, p) + t.substring(p + 1);
// split the string into tokens and add all camel-case splitting
final String[] u = CommonPattern.SPACE.split(t);
final String[] u = CommonPattern.SPACES.split(t);
final Set<String> token = new LinkedHashSet<String>();
for (final String r: u) token.add(r);
for (final String r: u) token.addAll(parseCamelCase(r));

@ -39,6 +39,7 @@ import java.util.regex.Pattern;
public class CommonPattern {
public final static Pattern SPACE = Pattern.compile(" ");
public final static Pattern SPACES = Pattern.compile(" +"); // pattern for one or multiple spaces
public final static Pattern COMMA = Pattern.compile(",");
public final static Pattern SEMICOLON = Pattern.compile(";");
public final static Pattern DOUBLEPOINT = Pattern.compile(":");

@ -305,7 +305,7 @@ public class DidYouMean {
s = snippet + (afterSnippet.length() > 0 ? " " + afterSnippet : "");
for (int i = 0; i < s.length(); i++) {char c = s.charAt(i); if (c < 'A') s = s.replace(c, ' ');} // remove funny symbols
s = s.replaceAll("<b>", " ").replaceAll("</b>", " ").replaceAll(" ", " ").trim(); // wipe superfluous whitespace
String[] sx = CommonPattern.SPACE.split(s);
String[] sx = CommonPattern.SPACES.split(s);
StringBuilder sb = new StringBuilder(s.length());
for (String x: sx) if (x.length() > 1 && sb.length() < 28) sb.append(x).append(' '); else break;
s = sb.toString().trim();

@ -284,7 +284,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.title_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.title_words_val)) {
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACE.split(title).length)};
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACES.split(title).length)};
add(doc, CollectionSchema.title_words_val, cv);
}
@ -297,7 +297,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.description_chars_val, description_exist ? new Integer[]{new Integer(description.length())} : new Integer[0]);
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{new Integer(description.length() == 0 ? 0 : CommonPattern.SPACE.split(description).length)} : new Integer[0]);
add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{new Integer(description.length() == 0 ? 0 : CommonPattern.SPACES.split(description).length)} : new Integer[0]);
}
String keywords = md.dc_subject();
@ -460,7 +460,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (allAttr || contains(CollectionSchema.title_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
for (String s: titles) cv.add(new Integer(CommonPattern.SPACE.split(s).length));
for (String s: titles) cv.add(new Integer(CommonPattern.SPACES.split(s).length));
add(doc, CollectionSchema.title_words_val, cv);
}
@ -479,7 +479,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACE.split(s).length));
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACES.split(s).length));
add(doc, CollectionSchema.description_words_val, cv);
}

Loading…
Cancel
Save