replaced String with StringBuilder in suggestion process

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8020 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent d871812621
commit 0d858d48ec

@ -282,13 +282,14 @@ public class ViewFile {
boolean dark = true;
int i = 0;
String sentence, token;
String sentence;
StringBuilder token;
if (sentences != null) {
// Search word highlighting
for (final StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<String> tokens = null;
Enumeration<StringBuilder> tokens = null;
tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement();

@ -31,7 +31,6 @@ import net.yacy.kelondro.data.word.Word;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
import de.anomic.data.DidYouMean;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -43,7 +42,7 @@ import de.anomic.server.servletProperties;
* http://www.opensearch.org/Specifications/OpenSearch/Extensions/Suggestions/1.1
* or
* https://wiki.mozilla.org/Search_Service/Suggestions
*
*
* for xml format:
* see Microsoft Search Suggestion Format
* http://msdn.microsoft.com/en-us/library/cc848863%28VS.85%29.aspx
@ -51,9 +50,9 @@ import de.anomic.server.servletProperties;
* http://msdn.microsoft.com/en-us/library/cc848862%28v=VS.85%29.aspx
*/
public class suggest {
private static final int meanMax = 30;
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final servletProperties prop = new servletProperties();
@ -62,17 +61,17 @@ public class suggest {
final boolean json = ext.equals("json");
final boolean xml = ext.equals("xml");
final boolean more = post != null && post.containsKey("more");
// get query
final String originalquerystring = (post == null) ? "" : post.get("query", post.get("q", "")).trim();
final String querystring = originalquerystring.replace('+', ' ');
final int timeout = (post == null) ? 300 : post.getInt("timeout", 300);
final int count = (post == null) ? 20 : post.getInt("count", 20);
// get segment
final Segment indexSegment;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
final String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
} else {
@ -83,18 +82,18 @@ public class suggest {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
int c = 0;
if (more ||
(indexSegment != null &&
!indexSegment.termIndex().has(Word.word2hash(querystring))))
{
final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring);
final Iterator<String> meanIt = didYouMean.getSuggestions(timeout, count).iterator();
final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(timeout, count).iterator();
String suggestion;
//[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]]
while (c < meanMax && meanIt.hasNext()) {
suggestion = meanIt.next();
suggestion = meanIt.next().toString();
if (json) {
prop.putJSON("suggestions_" + c + "_text", suggestion);
} else if (xml) {
@ -106,7 +105,7 @@ public class suggest {
c++;
}
}
if (c > 0) {
prop.put("suggestions_" + (c - 1) + "_eol", 1);
}
@ -125,9 +124,9 @@ public class suggest {
outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*");
prop.setOutgoingHeader(outgoingHeader);
}
// return rewrite properties
return prop;
}
}

@ -656,12 +656,12 @@ public class yacysearch {
prop.put("meanCount", meanMax);
if (meanMax > 0 && !json && !rss) {
final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring);
final Iterator<String> meanIt = didYouMean.getSuggestions(100, 5).iterator();
final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(100, 5).iterator();
int meanCount = 0;
String suggestion;
while( meanCount<meanMax && meanIt.hasNext()) {
suggestion = meanIt.next();
suggestion = meanIt.next().toString();
prop.put("didYouMean_suggestions_"+meanCount+"_word", suggestion);
prop.put("didYouMean_suggestions_"+meanCount+"_url",
QueryParams.navurl("html", 0, theQuery, suggestion, originalUrlMask.toString(), theQuery.navigators).toString()

@ -10,6 +10,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.ranking.ClusteredScoreMap;
import net.yacy.cora.ranking.ReversibleScoreMap;
import net.yacy.document.LibraryProvider;
import net.yacy.document.StringBuilderComparator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
@ -17,7 +18,7 @@ import net.yacy.kelondro.rwi.IndexCell;
/**
* People make mistakes when they type words.
* People make mistakes when they type words.
* The most common mistakes are the four categories listed below:
* <ol>
* <li>Changing one letter: bat / cat;</li>
@ -35,7 +36,7 @@ public class DidYouMean {
private static final int MinimumInputWordLength = 2;
private static final int MinimumOutputWordLength = 4;
private static final char[] ALPHABET_LATIN = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z',
@ -52,40 +53,40 @@ public class DidYouMean {
}
private static final char[][] ALPHABETS = {ALPHABET_LATIN, ALPHABET_KANJI};
private static char[] alphabet = ALPHABET_LATIN;
private static final String POISON_STRING = "\n";
private static final StringBuilder POISON_STRING = new StringBuilder("\n");
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
private final IndexCell<WordReference> index;
private final String word;
private final StringBuilder word;
private final int wordLen;
private final LinkedBlockingQueue<String> guessGen, guessLib;
private final LinkedBlockingQueue<StringBuilder> guessGen, guessLib;
private long timeLimit;
private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
private final SortedSet<String> resultSet;
private final SortedSet<StringBuilder> resultSet;
private final indexSizeComparator INDEX_SIZE_COMPARATOR;
/**
* @param index a termIndex - most likely retrieved from a switchboard object.
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/
public DidYouMean(final IndexCell<WordReference> index, final String word0) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
this.word = word0.toLowerCase();
this.wordLen = word.length();
public DidYouMean(final IndexCell<WordReference> index, final StringBuilder word0) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
this.word = word0;
this.wordLen = this.word.length();
this.index = index;
this.guessGen = new LinkedBlockingQueue<String>();
this.guessLib = new LinkedBlockingQueue<String>();
this.guessGen = new LinkedBlockingQueue<StringBuilder>();
this.guessLib = new LinkedBlockingQueue<StringBuilder>();
this.createGen = true;
this.INDEX_SIZE_COMPARATOR = new indexSizeComparator();
// identify language
if (this.word.length() > 0) {
char testchar = this.word.charAt(0);
final char testchar = this.word.charAt(0);
boolean alphafound = false;
alphatest: for (char[] alpha: ALPHABETS) {
alphatest: for (final char[] alpha: ALPHABETS) {
if (isAlphabet(alpha, testchar)) {
alphabet = alpha;
alphafound = true;
@ -94,8 +95,8 @@ public class DidYouMean {
}
if (!alphafound) {
// generate generic alphabet using simply a character block of 256 characters
char firstchar = (char) ((0xff & (testchar / 256)) * 256);
char lastchar = (char) (firstchar + 255);
final char firstchar = (char) ((0xff & (testchar / 256)) * 256);
final char lastchar = (char) (firstchar + 255);
alphabet = new char[256];
for (char a = firstchar; a <= lastchar; a++) {
alphabet[0xff & (a - firstchar)] = a;
@ -103,18 +104,18 @@ public class DidYouMean {
}
}
}
/**
 * Checks whether a character is a member of an alphabet.
 *
 * @param alpha the alphabet to search
 * @param testchar the character to look for
 * @return <code>true</code> if <code>testchar</code> occurs in <code>alpha</code>, <code>false</code> otherwise
 */
private static final boolean isAlphabet(final char[] alpha, final char testchar) {
    int pos = 0;
    while (pos < alpha.length && alpha[pos] != testchar) {
        pos++;
    }
    // the scan stopped before the end exactly when a matching character was found
    return pos < alpha.length;
}
/**
 * Discards everything collected so far: empties the result set and both guess queues
 * (<code>guessGen</code> and <code>guessLib</code>).
 */
public void reset() {
this.resultSet.clear();
this.guessGen.clear();
this.guessLib.clear();
}
/**
* get suggestions for a given word. The result is first ordered using a term size ordering,
* and a subset of the result is sorted again with an IO-intensive order based on the index size
@ -123,29 +124,29 @@ public class DidYouMean {
* @param preSortSelection the number of words that participate in the IO-intensive sort
* @return
*/
public SortedSet<String> getSuggestions(final long timeout, final int preSortSelection) {
public SortedSet<StringBuilder> getSuggestions(final long timeout, final int preSortSelection) {
if (this.word.length() < MinimumInputWordLength) return this.resultSet; // return nothing if input is too short
final long startTime = System.currentTimeMillis();
final long timelimit = startTime + timeout;
if (this.word.indexOf(' ') > 0) return getSuggestions(this.word.split(" "), timeout, preSortSelection, this.index);
final SortedSet<String> preSorted = getSuggestions(timeout);
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.indexOf(this.word, ' ') > 0) return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.index);
final SortedSet<StringBuilder> preSorted = getSuggestions(timeout);
if (System.currentTimeMillis() > timelimit) {
Log.logInfo("DidYouMean", "found and returned " + preSorted.size() + " unsorted suggestions (1); execution time: "
+ (System.currentTimeMillis() - startTime) + "ms");
return preSorted;
}
final ReversibleScoreMap<String> scored = new ClusteredScoreMap<String>();
for (final String s: preSorted) {
final ReversibleScoreMap<StringBuilder> scored = new ClusteredScoreMap<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
for (final StringBuilder s: preSorted) {
if (System.currentTimeMillis() > timelimit) break;
if (!(scored.sizeSmaller(2 * preSortSelection))) break;
scored.inc(s, index.count(Word.word2hash(s)));
scored.inc(s, this.index.count(Word.word2hash(s)));
}
final SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
final int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
final SortedSet<StringBuilder> countSorted = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
final int wc = this.index.count(Word.word2hash(this.word)); // all counts must be greater than this
while (!scored.isEmpty() && countSorted.size() < preSortSelection) {
final String s = scored.getMaxKey();
int score = scored.delete(s);
final StringBuilder s = scored.getMaxKey();
final int score = scored.delete(s);
if (s.length() >= MinimumOutputWordLength && score > wc) countSorted.add(s);
if (System.currentTimeMillis() > timelimit) break;
}
@ -161,7 +162,7 @@ public class DidYouMean {
return countSorted;
}
/**
* return a string that is a suggestion list for the list of given words
* @param words
@ -170,13 +171,13 @@ public class DidYouMean {
* @return
*/
@SuppressWarnings("unchecked")
private static SortedSet<String> getSuggestions(final String[] words, final long timeout, final int preSortSelection, final IndexCell<WordReference> index) {
final SortedSet<String>[] s = new SortedSet[words.length];
private static SortedSet<StringBuilder> getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final IndexCell<WordReference> index) {
final SortedSet<StringBuilder>[] s = new SortedSet[words.length];
for (int i = 0; i < words.length; i++) {
s[i] = new DidYouMean(index, words[i]).getSuggestions(timeout / words.length, preSortSelection);
}
// make all permutations
final SortedSet<String> result = new TreeSet<String>();
final SortedSet<StringBuilder> result = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
StringBuilder sb;
for (int i = 0; i < words.length; i++) {
if (s[i].isEmpty()) continue;
@ -185,21 +186,21 @@ public class DidYouMean {
if (j > 0) sb.append(' ');
if (i == j) sb.append(s[j].first()); else sb.append(words[j]);
}
result.add(sb.toString());
result.add(sb);
}
return result;
}
/**
* This method triggers the producer and consumer threads of the DidYouMean object.
* @param word a String with a single word
* @param timeout execution time in ms.
* @return a Set&lt;String&gt; with word variations contained in term index.
*/
private SortedSet<String> getSuggestions(final long timeout) {
long startTime = System.currentTimeMillis();
private SortedSet<StringBuilder> getSuggestions(final long timeout) {
final long startTime = System.currentTimeMillis();
this.timeLimit = startTime + timeout;
// create one consumer thread that checks the guessLib queue
// for occurrences in the index. If the producers are started next, their
// results can be consumed directly
@ -208,14 +209,14 @@ public class DidYouMean {
consumers[0].start();
// get a single recommendation for the word without altering the word
Set<String> libr = LibraryProvider.dymLib.recommend(this.word);
for (final String t: libr) {
final Set<StringBuilder> libr = LibraryProvider.dymLib.recommend(this.word);
for (final StringBuilder t: libr) {
if (!t.equals(this.word)) try {
createGen = false;
guessLib.put(t);
} catch (InterruptedException e) {}
this.createGen = false;
this.guessLib.put(t);
} catch (final InterruptedException e) {}
}
// create and start producers
// the CPU load to create the guessed words is very low, but the testing
// against the library may be CPU intensive. Since it is possible to test
@ -226,50 +227,50 @@ public class DidYouMean {
producers[2] = new DeletingOneLetter();
producers[3] = new ReversingTwoConsecutiveLetters();
for (final Thread t: producers) t.start();
// start more consumers if there are more cores
if (consumers.length > 1) for (int i = 1; i < consumers.length; i++) {
consumers[i] = new Consumer();
consumers[i].start();
}
// now decide which kind of guess is better
// we take guessLib entries as long as there is any entry in it
// to see if this is the case, we must wait for termination of the producer
for (final Thread t: producers) try { t.join(); } catch (InterruptedException e) {}
for (final Thread t: producers) try { t.join(); } catch (final InterruptedException e) {}
// if there is not any entry in guessLib, then transfer all entries from the
// guessGen to guessLib
if (createGen) try {
if (this.createGen) try {
this.guessGen.put(POISON_STRING);
String s;
StringBuilder s;
while (!(s = this.guessGen.take()).equals(POISON_STRING)) this.guessLib.put(s);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
// put poison into guessLib to terminate consumers
for (@SuppressWarnings("unused") final Consumer c: consumers)
try { guessLib.put(POISON_STRING); } catch (InterruptedException e) {}
try { this.guessLib.put(POISON_STRING); } catch (final InterruptedException e) {}
// wait for termination of consumer
for (final Consumer c: consumers)
try { c.join(); } catch (InterruptedException e) {}
try { c.join(); } catch (final InterruptedException e) {}
// we don't want the given word in the result
this.resultSet.remove(this.word);
return this.resultSet;
}
private void test(final String s) throws InterruptedException {
final Set<String> libr = LibraryProvider.dymLib.recommend(s);
private void test(final StringBuilder s) throws InterruptedException {
final Set<StringBuilder> libr = LibraryProvider.dymLib.recommend(s);
libr.addAll(LibraryProvider.geoLoc.recommend(s));
if (!libr.isEmpty()) createGen = false;
for (final String t: libr) {
guessLib.put(t);
if (!libr.isEmpty()) this.createGen = false;
for (final StringBuilder t: libr) {
this.guessLib.put(t);
}
if (createGen) {
guessGen.put(s);
if (this.createGen) {
this.guessGen.put(s);
}
}
@ -279,72 +280,78 @@ public class DidYouMean {
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
public class ChangingOneLetter extends Thread {
@Override
public void run() {
char m;
for (int i = 0; i < wordLen; i++) try {
m = word.charAt(i);
for (char c: alphabet) {
if (m != c) test(word.substring(0, i) + c + word.substring(i + 1));
if (System.currentTimeMillis() > timeLimit) return;
for (int i = 0; i < DidYouMean.this.wordLen; i++) try {
m = DidYouMean.this.word.charAt(i);
for (final char c: alphabet) {
if (m != c) {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1));
test(ts);
}
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
/**
* DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (len) tests.
*/
private class DeletingOneLetter extends Thread {
@Override
public void run() {
for (int i = 0; i < wordLen; i++) try {
test(word.substring(0, i) + word.substring(i+1));
if (System.currentTimeMillis() > timeLimit) return;
} catch (InterruptedException e) {}
for (int i = 0; i < DidYouMean.this.wordLen; i++) try {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
} catch (final InterruptedException e) {}
}
}
/**
* DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term
* based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (alphabet.length * len) tests.
*/
private class AddingOneLetter extends Thread {
@Override
public void run() {
for (int i = 0; i <= wordLen; i++) try {
for (int i = 0; i <= DidYouMean.this.wordLen; i++) try {
for (final char c: alphabet) {
test(word.substring(0, i) + c + word.substring(i));
if (System.currentTimeMillis() > timeLimit) return;
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
/**
* DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term
* and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
* <b>Note:</b> the loop runs (len-1) tests.
*/
private class ReversingTwoConsecutiveLetters extends Thread {
@Override
public void run() {
for (int i = 0; i < wordLen - 1; i++) try {
test(word.substring(0, i) + word.charAt(i + 1) + word.charAt(i) + word.substring(i +2));
if (System.currentTimeMillis() > timeLimit) return;
} catch (InterruptedException e) {}
for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) try {
final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2));
test(ts);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
} catch (final InterruptedException e) {}
}
}
/**
* DidYouMean's consumer thread takes a String object (term) from the blocking queue
* and checks if it is contained in YaCy's RWI index.
@ -354,64 +361,64 @@ public class DidYouMean {
@Override
public void run() {
String s;
StringBuilder s;
try {
while ((s = guessLib.take()) != POISON_STRING) {
if (s.length() >= MinimumOutputWordLength && index.has(Word.word2hash(s))) resultSet.add(s);
if (System.currentTimeMillis() > timeLimit) return;
while ((s = DidYouMean.this.guessLib.take()) != POISON_STRING) {
if (s.length() >= MinimumOutputWordLength && DidYouMean.this.index.has(Word.word2hash(s))) DidYouMean.this.resultSet.add(s);
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return;
}
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
/**
* indexSizeComparator is used by DidYouMean to order terms by index.count()
* <b>Warning:</b> this causes heavy i/o
*/
private class indexSizeComparator implements Comparator<String> {
private class indexSizeComparator implements Comparator<StringBuilder> {
public int compare(final String o1, final String o2) {
final int i1 = index.count(Word.word2hash(o1));
final int i2 = index.count(Word.word2hash(o2));
public int compare(final StringBuilder o1, final StringBuilder o2) {
final int i1 = DidYouMean.this.index.count(Word.word2hash(o1));
final int i2 = DidYouMean.this.index.count(Word.word2hash(o2));
if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
}
}
}
/**
* wordLengthComparator is used by DidYouMean to order terms by the term length
* This is the default order if the indexSizeComparator is not used
*/
private static class wordLengthComparator implements Comparator<String> {
private static class wordLengthComparator implements Comparator<StringBuilder> {
public int compare(final String o1, final String o2) {
public int compare(final StringBuilder o1, final StringBuilder o2) {
final int i1 = o1.length();
final int i2 = o2.length();
if (i1 == i2) return o1.compareTo(o2);
if (i1 == i2) return StringBuilderComparator.CASE_INSENSITIVE_ORDER.compare(o1, o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
}
}
/**
* headMatchingComparator is used to sort results in such a way that words that match with the given words are sorted first
*/
private static class headMatchingComparator implements Comparator<String> {
private final String head;
private final Comparator<String> secondaryComparator;
public headMatchingComparator(final String head, final Comparator<String> secondaryComparator) {
this.head = head.toLowerCase();
private static class headMatchingComparator implements Comparator<StringBuilder> {
private final StringBuilder head;
private final Comparator<StringBuilder> secondaryComparator;
public headMatchingComparator(final StringBuilder head, final Comparator<StringBuilder> secondaryComparator) {
this.head = head;
this.secondaryComparator = secondaryComparator;
}
public int compare(final String o1, final String o2) {
boolean o1m = o1.toLowerCase().startsWith(head);
boolean o2m = o2.toLowerCase().startsWith(head);
if ((o1m && o2m) || (!o1m && !o2m)) return secondaryComparator.compare(o1, o2);
public int compare(final StringBuilder o1, final StringBuilder o2) {
final boolean o1m = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o1, this.head);
final boolean o2m = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o2, this.head);
if ((o1m && o2m) || (!o1m && !o2m)) return this.secondaryComparator.compare(o1, o2);
return o1m ? -1 : 1;
}
}
}

@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
// final TreeMap<String, YMarkTag> pairs = new TreeMap<String, YMarkTag>();
String token;
StringBuilder token;
// StringBuilder pair = new StringBuilder(64);
if(document != null) {
@ -100,7 +100,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
buffer.append(document.dc_title());
buffer.append(document.dc_description());
buffer.append(document.dc_subject(' '));
final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
int count = 0;
@ -133,7 +133,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
if (token.length()>3) {
count = word.occurrences() * 100;
}
topwords.add(new YMarkTag(token, count));
topwords.add(new YMarkTag(token.toString(), count));
}
}
count = 0;

@ -118,6 +118,27 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
}
}
/**
 * Stores the text of a StringBuilder under the given key.
 * <ul>
 * <li>If <code>key</code> is <code>null</code>, the call has no effect.</li>
 * <li>If <code>value</code> is <code>null</code>, any mapping for <code>key</code> is removed.</li>
 * <li>Otherwise <code>value.toString()</code> is stored under <code>key</code>.</li>
 * </ul>
 * @param key the key of the entry; may be <code>null</code>
 * @param value the content to store, or <code>null</code> to delete the entry
 */
public void put(final String key, final StringBuilder value) {
    if (key == null) {
        // nothing can be stored without a key
        return;
    }
    if (value == null) {
        // a null value has the same effect as removing the element
        super.remove(key);
        return;
    }
    super.put(key, value.toString());
}
/**
* Add byte array to the map, value is kept as it is.
* @param key key name as String.
@ -165,6 +186,10 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return put(key, toJSON(value));
}
/**
 * Stores the JSON-escaped text of a StringBuilder under the given key.
 * @param key the key of the entry
 * @param value the content to escape and store; must not be <code>null</code>
 *        because <code>toString()</code> is called on it unconditionally
 * @return the result of the String-valued <code>put</code> for the escaped text
 */
public String putJSON(final String key, final StringBuilder value) {
    final String escaped = toJSON(value.toString());
    return put(key, escaped);
}
private static String toJSON(String value) {
// value = value.replaceAll("\\", "\\\\");
value = patternDoublequote.matcher(value).replaceAll("'");

@ -154,6 +154,11 @@ public class UTF8 {
return s.getBytes(charset);
}
/**
 * Encodes the content of a StringBuilder using the <code>charset</code> constant of this class.
 * @param s the text to encode, may be <code>null</code>
 * @return the encoded bytes, or <code>null</code> if <code>s</code> is <code>null</code>
 */
public final static byte[] getBytes(final StringBuilder s) {
    return (s == null) ? null : s.toString().getBytes(charset);
}
/**
* Decodes a <code>application/x-www-form-urlencoded</code> string using a specific
* encoding scheme.
@ -179,15 +184,22 @@ public class UTF8 {
int pos = 0;
while (((i+2) < numChars) && (c=='%')) {
final int v = Integer.parseInt(s.substring(i+1,i+3),16);
if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
if (v < 0) {
return s;
//throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
}
bytes[pos++] = (byte) v;
i+= 3;
if (i < numChars) c = s.charAt(i);
}
if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
if ((i < numChars) && (c=='%')) {
return s;
//throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
}
sb.append(new String(bytes, 0, pos, charset));
} catch (final NumberFormatException e) {
throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
return s;
//throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
}
needToChange = true;
break;

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -26,6 +26,7 @@ package net.yacy.cora.ranking;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
@ -37,77 +38,84 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.storage.OutOfLimitsException;
public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements ReversibleScoreMap<E> {
protected final Map<E, Long> map; // a mapping from a reference to the cluster key
protected final TreeMap<Long, E> pam; // a mapping from the cluster key to the reference
private long gcount;
private int encnt;
public ClusteredScoreMap() {
map = new TreeMap<E, Long>();
pam = new TreeMap<Long, E>();
gcount = 0;
encnt = 0;
this.map = new TreeMap<E, Long>();
this.pam = new TreeMap<Long, E>();
this.gcount = 0;
this.encnt = 0;
}
/**
 * Creates an empty score map whose objects are ordered by the given comparator
 * rather than by their natural ordering.
 * @param c the comparator that orders the objects; a comparator declared for any
 *        supertype of <code>E</code> is accepted, since it is only handed on to the <code>TreeMap</code>
 */
public ClusteredScoreMap(final Comparator<? super E> c) {
    this.map = new TreeMap<E, Long>(c);
    this.pam = new TreeMap<Long, E>();
    this.gcount = 0;
    this.encnt = 0;
}
public Iterator<E> iterator() {
return map.keySet().iterator();
return this.map.keySet().iterator();
}
public synchronized void clear() {
map.clear();
pam.clear();
gcount = 0;
encnt = 0;
this.map.clear();
this.pam.clear();
this.gcount = 0;
this.encnt = 0;
}
/**
* shrink the cluster to a demanded size
* @param maxsize
*/
public void shrinkToMaxSize(int maxsize) {
public void shrinkToMaxSize(final int maxsize) {
if (maxsize < 0) return;
Long key;
synchronized (this) {
while (map.size() > maxsize) {
while (this.map.size() > maxsize) {
// find and remove smallest objects until cluster has demanded size
key = pam.firstKey();
key = this.pam.firstKey();
if (key == null) break;
map.remove(pam.remove(key));
this.map.remove(this.pam.remove(key));
}
}
}
/**
* shrink the cluster in such a way that the smallest score is equal or greater than a given minScore
* @param minScore
*/
public void shrinkToMinScore(int minScore) {
public void shrinkToMinScore(final int minScore) {
int score;
Long key;
synchronized (this) {
while (pam.size() > 0) {
while (this.pam.size() > 0) {
// find and remove objects where their score is smaller than the demanded minimum score
key = pam.firstKey();
key = this.pam.firstKey();
if (key == null) break;
score = (int) ((key.longValue() & 0xFFFFFFFF00000000L) >> 32);
if (score >= minScore) break;
map.remove(pam.remove(key));
this.map.remove(this.pam.remove(key));
}
}
}
// pattern of the compact timestamp format used by shortFormatter
public static final String shortDateFormatString = "yyyyMMddHHmmss";
// NOTE(review): SimpleDateFormat is documented as not synchronized; this shared public
// instance needs external synchronization if it is used from several threads - confirm callers
public static final SimpleDateFormat shortFormatter = new SimpleDateFormat(shortDateFormatString, Locale.US);
// number of milliseconds in one minute
public static final long minutemillis = 60000;
// time value of "20000101000000" as parsed by shortFormatter; remains 0 if parsing fails
public static long date2000 = 0;
static {
try {
date2000 = shortFormatter.parse("20000101000000").getTime();
// a ParseException is deliberately ignored, which leaves date2000 at its initial value 0
} catch (final ParseException e) {}
}
public static int object2score(Object o) {
if (o instanceof Integer) return ((Integer) o).intValue();
if (o instanceof Long) {
@ -126,7 +134,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
String s = null;
if (o instanceof String) s = (String) o;
if (o instanceof byte[]) s = UTF8.String((byte[]) o);
// this can be used to calculate a score from a string
if (s == null || s.length() == 0 || s.charAt(0) == '-') return 0;
try {
@ -163,7 +171,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
return c;
}
}
private static final byte[] plainByteArray = new byte[256];
static {
for (int i = 0; i < 32; i++) plainByteArray[i] = (byte) i;
@ -171,235 +179,235 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
for (int i = 96; i < 128; i++) plainByteArray[i] = (byte) (i - 64);
for (int i = 128; i < 256; i++) plainByteArray[i] = (byte) (i & 0X20);
}
/**
 * Packs an element count and an element number into a single long value:
 * the count occupies the upper 32 bits, the number the lower 32 bits.
 * @param elementNr the value placed into the lower 32 bits
 * @param elementCount the value placed into the upper 32 bits
 * @return the combined 64-bit key
 */
private long scoreKey(final int elementNr, final int elementCount) {
    // masking with a long constant prevents sign extension of negative ints
    final long upper = (elementCount & 0xFFFFFFFFL) << 32;
    final long lower = elementNr & 0xFFFFFFFFL;
    return upper | lower;
}
public synchronized long totalCount() {
return gcount;
return this.gcount;
}
public synchronized int size() {
return map.size();
return this.map.size();
}
public boolean sizeSmaller(int size) {
return map.size() < size;
public boolean sizeSmaller(final int size) {
return this.map.size() < size;
}
public synchronized boolean isEmpty() {
return map.isEmpty();
return this.map.isEmpty();
}
/**
 * Raises the score of the given object by one.
 *
 * @param obj the object whose score is increased
 */
public synchronized void inc(final E obj) {
    final int step = 1;
    this.inc(obj, step);
}
/**
 * Lowers the score of the given object by one; implemented as an increment by -1.
 *
 * @param obj the object whose score is decreased
 */
public synchronized void dec(final E obj) {
    final int step = -1;
    this.inc(obj, step);
}
public void set(final E obj, final int newScore) {
if (obj == null) return;
synchronized (this) {
Long usk = map.remove(obj); // get unique score key, old entry is not needed any more
Long usk = this.map.remove(obj); // get unique score key, old entry is not needed any more
if (newScore < 0) throw new OutOfLimitsException(newScore);
if (usk == null) {
// set new value
usk = Long.valueOf(scoreKey(encnt++, newScore));
usk = Long.valueOf(scoreKey(this.encnt++, newScore));
// put new value into cluster
map.put(obj, usk);
pam.put(usk, obj);
this.map.put(obj, usk);
this.pam.put(usk, obj);
} else {
// delete old entry
pam.remove(usk);
this.pam.remove(usk);
// get previous handle and score
final long c = usk.longValue();
final int oldScore = (int) ((c & 0xFFFFFFFF00000000L) >> 32);
final int oldHandle = (int) (c & 0xFFFFFFFFL);
gcount -= oldScore;
this.gcount -= oldScore;
// set new value
usk = Long.valueOf(scoreKey(oldHandle, newScore)); // generates an unique key for a specific score
map.put(obj, usk);
pam.put(usk, obj);
this.map.put(obj, usk);
this.pam.put(usk, obj);
}
}
}
// increase overall counter
gcount += newScore;
this.gcount += newScore;
}
public void inc(final E obj, final int incrementScore) {
if (obj == null) return;
synchronized (this) {
Long usk = map.remove(obj); // get unique score key, old entry is not needed any more
Long usk = this.map.remove(obj); // get unique score key, old entry is not needed any more
if (usk == null) {
// set new value
if (incrementScore < 0) throw new OutOfLimitsException(incrementScore);
usk = Long.valueOf(scoreKey(encnt++, incrementScore));
usk = Long.valueOf(scoreKey(this.encnt++, incrementScore));
// put new value into cluster
map.put(obj, usk);
pam.put(usk, obj);
this.map.put(obj, usk);
this.pam.put(usk, obj);
} else {
// delete old entry
pam.remove(usk);
this.pam.remove(usk);
// get previous handle and score
final long c = usk.longValue();
final int oldScore = (int) ((c & 0xFFFFFFFF00000000L) >> 32);
final int oldHandle = (int) (c & 0xFFFFFFFFL);
// set new value
final int newValue = oldScore + incrementScore;
if (newValue < 0) throw new OutOfLimitsException(newValue);
usk = Long.valueOf(scoreKey(oldHandle, newValue)); // generates an unique key for a specific score
map.put(obj, usk);
pam.put(usk, obj);
this.map.put(obj, usk);
this.pam.put(usk, obj);
}
}
}
// increase overall counter
gcount += incrementScore;
this.gcount += incrementScore;
}
/**
 * Lowers the score of the given object; implemented as an increment by the negated amount.
 *
 * @param obj the object whose score is decreased
 * @param incrementScore the amount that is subtracted from the score
 */
public void dec(final E obj, final int incrementScore) {
    final int negated = -incrementScore;
    this.inc(obj, negated);
}
/**
 * Removes an object from the score map.
 *
 * @param obj the object to remove; null is ignored
 * @return the score the object had before removal, or 0 if obj is null or was not contained
 */
public int delete(final E obj) {
// deletes entry and returns previous score
if (obj == null) return 0;
final Long usk;
synchronized (this) {
usk = map.remove(obj); // get unique score key, old entry is not needed any more
usk = this.map.remove(obj); // get unique score key, old entry is not needed any more
if (usk == null) return 0;
// delete old entry from the reverse (key -> object) map as well
pam.remove(usk);
this.pam.remove(usk);
}
// the previous score is stored in the upper 32 bits of the key
final int oldScore = (int) ((usk.longValue() & 0xFFFFFFFF00000000L) >> 32);
// decrease overall counter
// NOTE(review): gcount is modified after the synchronized block has been left while
// totalCount() reads it under the lock -- confirm that this unsynchronized update is intended.
gcount -= oldScore;
return oldScore;
this.gcount -= oldScore;
return oldScore;
}
public synchronized boolean containsKey(final E obj) {
return map.containsKey(obj);
return this.map.containsKey(obj);
}
public int get(final E obj) {
if (obj == null) return 0;
final Long cs;
synchronized (this) {
cs = map.get(obj);
cs = this.map.get(obj);
}
if (cs == null) return 0;
return (int) ((cs.longValue() & 0xFFFFFFFF00000000L) >> 32);
}
public synchronized int getMaxScore() {
if (map.isEmpty()) return -1;
return (int) ((pam.lastKey().longValue() & 0xFFFFFFFF00000000L) >> 32);
if (this.map.isEmpty()) return -1;
return (int) ((this.pam.lastKey().longValue() & 0xFFFFFFFF00000000L) >> 32);
}
public synchronized int getMinScore() {
if (map.isEmpty()) return -1;
return (int) ((pam.firstKey().longValue() & 0xFFFFFFFF00000000L) >> 32);
if (this.map.isEmpty()) return -1;
return (int) ((this.pam.firstKey().longValue() & 0xFFFFFFFF00000000L) >> 32);
}
public synchronized E getMaxKey() {
if (map.isEmpty()) return null;
return pam.get(pam.lastKey());
if (this.map.isEmpty()) return null;
return this.pam.get(this.pam.lastKey());
}
public synchronized E getMinKey() {
if (map.isEmpty()) return null;
return pam.get(pam.firstKey());
if (this.map.isEmpty()) return null;
return this.pam.get(this.pam.firstKey());
}
public String toString() {
return map + " / " + pam;
return this.map + " / " + this.pam;
}
/**
 * Creates an iterator over the stored objects.
 *
 * @param up if true the iterator walks the entry set of the score-key map in its own order,
 *           otherwise the iterator repeatedly takes the last (highest) key first
 * @return a new iterator; both variants support remove()
 */
public synchronized Iterator<E> keys(final boolean up) {
    final Iterator<E> iterator;
    if (up) {
        iterator = new simpleScoreIterator<E>();
    } else {
        iterator = new reverseScoreIterator<E>();
    }
    return iterator;
}
private class reverseScoreIterator<A extends E> implements Iterator<E> {
SortedMap<Long, E> view;
Long key;
public reverseScoreIterator() {
view = pam;
this.view = ClusteredScoreMap.this.pam;
}
public boolean hasNext() {
return !view.isEmpty();
return !this.view.isEmpty();
}
public E next() {
key = view.lastKey();
view = view.headMap(key);
final E value = pam.get(key);
this.key = this.view.lastKey();
this.view = this.view.headMap(this.key);
final E value = ClusteredScoreMap.this.pam.get(this.key);
//System.out.println("cluster reverse iterator: score = " + ((((Long) key).longValue() & 0xFFFFFFFF00000000L) >> 32) + ", handle = " + (((Long) key).longValue() & 0xFFFFFFFFL) + ", value = " + value);
return value;
}
public void remove() {
final Object val = pam.remove(key);
if (val != null) map.remove(val);
final Object val = ClusteredScoreMap.this.pam.remove(this.key);
if (val != null) ClusteredScoreMap.this.map.remove(val);
}
}
private class simpleScoreIterator<A extends E> implements Iterator<E> {
Iterator<Map.Entry<Long, E>> ii;
Map.Entry<Long, E> entry;
public simpleScoreIterator() {
ii = pam.entrySet().iterator();
this.ii = ClusteredScoreMap.this.pam.entrySet().iterator();
}
public boolean hasNext() {
return ii.hasNext();
return this.ii.hasNext();
}
public E next() {
entry = ii.next();
this.entry = this.ii.next();
//System.out.println("cluster simple iterator: score = " + ((((Long) entry.getKey()).longValue() & 0xFFFFFFFF00000000L) >> 32) + ", handle = " + (((Long) entry.getKey()).longValue() & 0xFFFFFFFFL) + ", value = " + entry.getValue());
return entry.getValue();
return this.entry.getValue();
}
public void remove() {
ii.remove();
if (entry.getValue() != null) map.remove(entry.getValue());
this.ii.remove();
if (this.entry.getValue() != null) ClusteredScoreMap.this.map.remove(this.entry.getValue());
}
}
public static void main(final String[] args) {
final String t = "ZZZZZZZZZZ";
System.out.println("score of " + t + ": " + object2score(t));
if (args.length > 0) {
System.out.println("score of " + args[0] + ": " + object2score(args[0]));
System.exit(0);
}
System.out.println("Test for Score: start");
final ClusteredScoreMap<String> s = new ClusteredScoreMap<String>();
long c = 0;
@ -409,14 +417,14 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
final Random random = new Random(1234);
int r;
final int count = 20;
for (int x = 0; x < 100000; x++) {
for (int i = 0; i < count; i++) {
r = Math.abs(random.nextInt(100));
s.inc("score#" + r, r);
c += r;
}
// delete some
int p;
for (int i = 0; i < (count / 2); i++) {
@ -429,13 +437,13 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
}
System.out.println("finished create. time = " + (System.currentTimeMillis() - time));
System.out.println("result:");
Iterator<String> i = s.keys(true);
while (i.hasNext()) System.out.println("up: " + i.next());
i = s.keys(false);
while (i.hasNext()) System.out.println("down: " + i.next());
System.out.println("total=" + s.totalCount() + ", elements=" + s.size() + ", redundant count=" + c);
}
}

@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -37,7 +37,6 @@ import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
@ -51,7 +50,7 @@ import net.yacy.kelondro.util.SetTools;
public final class Condenser {
// this is the page analysis class
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
public final static int wordminsize = 2;
@ -82,21 +81,21 @@ public final class Condenser {
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
private final static int numlength = 5;
//private Properties analysis;
private Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
//public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4);
private Identificator languageIdentificator;
private final Identificator languageIdentificator;
private final NumberFormat intStringFormatter = NumberFormat.getIntegerInstance(); // use a new instance for each object for a better concurrency
public Condenser(
final Document document,
final boolean indexText,
@ -112,15 +111,15 @@ public final class Condenser {
this.RESULT_FLAGS = new Bitfield(4);
// construct flag set for document
if (!document.getImages().isEmpty()) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (!document.getAudiolinks().isEmpty()) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (!document.getVideolinks().isEmpty()) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (!document.getApplinks().isEmpty()) RESULT_FLAGS.set(flag_cat_hasapp, true);
if (document.lat() != 0.0f && document.lon() != 0.0f) RESULT_FLAGS.set(flag_cat_haslocation, true);
if (!document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
if (!document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (!document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (!document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
this.languageIdentificator = new Identificator();
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
assert document.getText() != null : document.dc_identifier();
@ -137,18 +136,18 @@ public final class Condenser {
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib);
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
}
// anchors: for text indexing we add only the anchor description
// REMOVED! Reason:
// words from the anchor description should appear as normal text in the output from the parser
@ -169,9 +168,9 @@ public final class Condenser {
this.RESULT_NUMB_SENTENCES = 0;
this.RESULT_DIFF_SENTENCES = 0;
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib);
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
@ -179,24 +178,24 @@ public final class Condenser {
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib);
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib);
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib);
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
}
// images
@ -207,25 +206,25 @@ public final class Condenser {
ientry = j.next();
url = ientry.url();
if (url == null) continue;
insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
}
// finally check all words for missing flag entry
final Iterator<Map.Entry<String, Word>> k = words.entrySet().iterator();
final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
Word wprop;
Map.Entry<String, Word> we;
while (k.hasNext()) {
we = k.next();
wprop = we.getValue();
if (wprop.flags == null) {
wprop.flags = RESULT_FLAGS.clone();
words.put(we.getKey(), wprop);
wprop.flags = this.RESULT_FLAGS.clone();
this.words.put(we.getKey(), wprop);
}
}
}
}
private void insertTextToWords(
final String text,
final int phrase,
@ -241,13 +240,13 @@ public final class Condenser {
int pip = 0;
while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
if (useForLanguageIdentification) languageIdentificator.add(word);
if (useForLanguageIdentification) this.languageIdentificator.add(word);
if (word.length() < 2) continue;
wprop = words.get(word);
wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true);
words.put(word, wprop);
this.words.put(word, wprop);
pip++;
this.RESULT_NUMB_WORDS++;
this.RESULT_DIFF_WORDS++;
@ -257,23 +256,23 @@ public final class Condenser {
public Condenser(final InputStream text, final WordCache meaningLib) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
words = new TreeMap<String, Word>();
this.words = new TreeMap<String, Word>();
createCondensement(text, meaningLib);
}
public int excludeWords(final SortedSet<String> stopwords) {
// subtracts the given stopwords from the word list
// the word list shrinkes. This returns the number of shrinked words
final int oldsize = words.size();
SetTools.excludeDestructive(words, stopwords);
return oldsize - words.size();
final int oldsize = this.words.size();
SetTools.excludeDestructive(this.words, stopwords);
return oldsize - this.words.size();
}
public Map<String, Word> words() {
// returns the words as word/indexWord relation map
return words;
return this.words;
}
public String language() {
return this.languageIdentificator.getLanguage();
}
@ -284,23 +283,24 @@ public final class Condenser {
String word = "";
String k;
int wordlen;
Word wsp, wsp1;
Word wsp;
final Word wsp1;
int wordHandle;
int wordHandleCount = 0;
int sentenceHandleCount = 0;
final int sentenceHandleCount = 0;
int allwordcounter = 0;
int allsentencecounter = 0;
final int allsentencecounter = 0;
int wordInSentenceCounter = 1;
boolean comb_indexof = false, last_last = false, last_index = false;
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
// read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toLowerCase(Locale.ENGLISH);
if (languageIdentificator != null) languageIdentificator.add(word);
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
@ -316,11 +316,11 @@ public final class Condenser {
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
// store word
allwordcounter++;
currsentwords.add(word);
wsp = words.get(word);
wsp = this.words.get(word);
if (wsp != null) {
// word already exists
wordHandle = wsp.posInText;
@ -329,8 +329,8 @@ public final class Condenser {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
wsp.flags = RESULT_FLAGS.clone();
words.put(word, wsp);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word, wsp);
}
// we now have the unique handle of the word, put it into the sentence:
wordInSentenceCounter++;
@ -341,7 +341,7 @@ public final class Condenser {
Map.Entry<String, Word> entry;
// we search for similar words and reorganize the corresponding sentences
// a word is similar, if a shortened version is equal
final Iterator<Map.Entry<String, Word>> wi = words.entrySet().iterator(); // enumerates the keys in descending order
final Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (wi.hasNext()) {
entry = wi.next();
word = entry.getKey();
@ -350,10 +350,10 @@ public final class Condenser {
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
if (words.containsKey(k)) {
if (this.words.containsKey(k)) {
// update word counter
wsp1.count = wsp1.count + wsp.count;
words.put(k, wsp1);
this.words.put(k, wsp1);
// remove current word
wi.remove();
continue wordsearch;
@ -370,7 +370,7 @@ public final class Condenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
}
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
@ -378,7 +378,7 @@ public final class Condenser {
buffer = new ByteArrayInputStream(UTF8.getBytes(text));
return new Condenser(buffer, meaningLib).words();
}
public static void main(final String[] args) {
// read a property file and convert them into configuration lines
try {
@ -391,8 +391,8 @@ public final class Condenser {
sb.append('"');
final String s = p.getProperty("keywords" + i);
final String[] l = s.split(",");
for (int j = 0; j < l.length; j++) {
sb.append(ASCII.String(Word.word2hash(l[j])));
for (final String element : l) {
sb.append(ASCII.String(Word.word2hash(element)));
}
if (i < 15) sb.append(",\n");
}
@ -403,7 +403,7 @@ public final class Condenser {
} catch (final IOException e) {
Log.logException(e);
}
}
}

@ -255,8 +255,8 @@ public class LibraryProvider {
final File here = new File("dummy").getParentFile();
initialize(new File(here, "DATA/DICTIONARIES"));
System.out.println("dymDict-size = " + dymLib.size());
final Set<String> r = dymLib.recommend("da");
for (final String s: r) {
final Set<StringBuilder> r = dymLib.recommend(new StringBuilder("da"));
for (final StringBuilder s: r) {
System.out.println("$ " + s);
}
System.out.println("recommendations: " + r.size());

@ -0,0 +1,147 @@
/**
* StringBuilderComparator.java
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 09.11.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.regex.Pattern;
/**
 * Comparator and helper methods for {@link StringBuilder} objects.
 * <p>
 * StringBuilder neither implements Comparable nor overrides equals/hashCode, therefore
 * sorted collections of StringBuilder need an explicit comparator. This class provides a
 * case-sensitive and a case-insensitive ordering (the latter follows the same folding
 * rule as {@link String#CASE_INSENSITIVE_ORDER}: upper-case first, then lower-case) and
 * a few String-like operations that work without converting the builder to a String.
 * <p>
 * Instances are immutable and can be shared between threads. A StringBuilder must not
 * be modified while it is stored as key in a collection that is sorted with this comparator.
 */
public class StringBuilderComparator implements Comparator<StringBuilder> {

    /** ordering that compares the characters as they are */
    public static final StringBuilderComparator CASE_SENSITIVE_ORDER = new StringBuilderComparator(false);

    /** ordering that ignores case differences */
    public static final StringBuilderComparator CASE_INSENSITIVE_ORDER = new StringBuilderComparator(true);

    private final boolean caseInsensitive;

    /**
     * @param caseInsensitive if true, characters that differ only in case are treated as equal
     */
    public StringBuilderComparator(final boolean caseInsensitive) {
        this.caseInsensitive = caseInsensitive;
    }

    /**
     * Compares two builders lexicographically, character by character.
     * If one builder is a prefix of the other, the shorter one is the smaller one.
     *
     * @param sb0 first builder, must not be null
     * @param sb1 second builder, must not be null
     * @return a negative number, zero or a positive number if sb0 is smaller than,
     *         equal to or greater than sb1
     */
    @Override
    public int compare(final StringBuilder sb0, final StringBuilder sb1) {
        final int l0 = sb0.length();
        final int l1 = sb1.length();
        final int ml = Math.min(l0, l1);
        for (int i = 0; i < ml; i++) {
            final int d = compareChars(sb0.charAt(i), sb1.charAt(i));
            if (d != 0) return d;
        }
        return l0 - l1;
    }

    /**
     * Tests if two builders have the same content with respect to the case mode of this comparator.
     *
     * @param sb0 first builder, must not be null
     * @param sb1 second builder, must not be null
     * @return true if both builders have the same length and all characters match
     */
    public boolean equals(final StringBuilder sb0, final StringBuilder sb1) {
        final int l0 = sb0.length();
        final int l1 = sb1.length();
        if (l0 != l1) return false;
        return equals(sb0, sb1, l1);
    }

    /**
     * Tests if one builder starts with the content of another builder.
     *
     * @param sb0 the builder that is examined
     * @param sb1 the expected prefix
     * @return true if the first sb1.length() characters of sb0 match sb1
     */
    public boolean startsWith(final StringBuilder sb0, final StringBuilder sb1) {
        final int l0 = sb0.length();
        final int l1 = sb1.length();
        if (l0 < l1) return false;
        return equals(sb0, sb1, l1);
    }

    /**
     * Tests if the first l characters of both builders match.
     * The caller must ensure that both builders have at least l characters.
     */
    private boolean equals(final StringBuilder sb0, final StringBuilder sb1, final int l) {
        for (int i = 0; i < l; i++) {
            if (compareChars(sb0.charAt(i), sb1.charAt(i)) != 0) return false;
        }
        return true;
    }

    /**
     * Compares two characters with respect to the case mode of this comparator.
     * In case-insensitive mode the characters are folded to upper case and then to
     * lower case; the returned difference is computed on the folded characters.
     *
     * @return zero if the characters are considered equal, otherwise their difference
     */
    private int compareChars(char c0, char c1) {
        if (c0 == c1) return 0;
        if (this.caseInsensitive) {
            c0 = Character.toUpperCase(c0);
            c1 = Character.toUpperCase(c1);
            if (c0 == c1) return 0;
            c0 = Character.toLowerCase(c0);
            c1 = Character.toLowerCase(c1);
            if (c0 == c1) return 0;
        }
        return c0 - c1;
    }

    // methods that can be useful for StringBuilder as replacement of String

    /**
     * Finds the first occurrence of a character. The search is always case-sensitive.
     *
     * @param sb the builder that is searched
     * @param ch the character to find
     * @return the index of the first occurrence or -1 if the character does not occur
     */
    public int indexOf(final StringBuilder sb, final char ch) {
        return indexOf(sb, 0, ch);
    }

    /**
     * Finds the first occurrence of a character at or after a given position.
     * The search is always case-sensitive.
     *
     * @param sb the builder that is searched
     * @param off the position where the search starts; a negative value is treated
     *            as 0 (like String.indexOf does), a value beyond the end yields -1
     * @param ch the character to find
     * @return the index of the first occurrence at or after off, or -1 if there is none
     */
    public int indexOf(final StringBuilder sb, final int off, final char ch) {
        final int max = sb.length();
        // clamp the start position, otherwise charAt would throw for negative offsets
        for (int i = Math.max(off, 0); i < max; i++) {
            if (sb.charAt(i) == ch) return i;
        }
        return -1;
    }

    /**
     * Splits a builder at every occurrence of a separator character.
     * Like String.split, empty parts at the end of the result are removed while
     * empty parts at the beginning or in the middle are kept.
     *
     * @param sb the builder to split
     * @param c the separator character
     * @return the parts; if the separator does not occur, the array contains the
     *         given builder itself (not a copy)
     */
    public StringBuilder[] split(final StringBuilder sb, final char c) {
        int next = 0;
        int off = 0;
        final ArrayList<StringBuilder> list = new ArrayList<StringBuilder>();
        while ((next = indexOf(sb, off, c)) != -1) {
            // copy the characters directly, no intermediate String is created
            list.add(new StringBuilder(next - off).append(sb, off, next));
            off = next + 1;
        }
        if (off == 0) return new StringBuilder[] { sb };
        list.add(new StringBuilder(sb.length() - off).append(sb, off, sb.length()));
        // drop trailing empty parts
        int resultSize = list.size();
        while (resultSize > 0 && list.get(resultSize - 1).length() == 0) resultSize--;
        return list.subList(0, resultSize).toArray(new StringBuilder[resultSize]);
    }

    /**
     * Splits a builder around the matches of a regular expression.
     *
     * @param sb the builder to split
     * @param pattern the compiled separator expression
     * @return the parts as computed by {@link Pattern#split(CharSequence)}, each in a new builder
     */
    public static StringBuilder[] split(final StringBuilder sb, final Pattern pattern) {
        final String[] p = pattern.split(sb);
        final StringBuilder[] h = new StringBuilder[p.length];
        for (int i = 0; i < p.length; i++) h[i] = new StringBuilder(p[i]);
        return h;
    }

    public static void main(final String[] args) {
        final StringBuilder s = new StringBuilder("ene mene mu");
        final StringBuilder[] t = StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(s, ' ');
        for (final StringBuilder u: t) System.out.println(u.toString());
    }
}

@ -49,12 +49,12 @@ public class WordCache {
// common word cache
private static final int commonWordsMaxSize = 100000; // maximum size of common word cache
private static final int commonWordsMinLength = 5; // words must have that length at minimum
private static OrderedScoreMap<String> commonWords = new OrderedScoreMap<String>(String.CASE_INSENSITIVE_ORDER);
private static OrderedScoreMap<StringBuilder> commonWords = new OrderedScoreMap<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
// dictionaries
private final File dictionaryPath;
private TreeSet<String> dict; // the word dictionary
private TreeSet<String> tcid; // the dictionary of reverse words
private TreeSet<StringBuilder> dict; // the word dictionary
private TreeSet<StringBuilder> tcid; // the dictionary of reverse words
/**
* create a new dictionary
@ -72,7 +72,7 @@ public class WordCache {
* add a word to the generic dictionary
* @param word
*/
public static void learn(final String word) {
public static void learn(final StringBuilder word) {
if (word == null) return;
if (word.length() < commonWordsMinLength) return;
if (MemoryControl.shortStatus()) commonWords.clear();
@ -86,8 +86,8 @@ public class WordCache {
* scan the input directory and load all dictionaries (again)
*/
public void reload() {
this.dict = new TreeSet<String>();
this.tcid = new TreeSet<String>();
this.dict = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
this.tcid = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return;
final String[] files = this.dictionaryPath.list();
for (final String f: files) {
@ -106,25 +106,27 @@ public class WordCache {
}
final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String l;
StringBuilder sb;
try {
while ((l = reader.readLine()) != null) {
if (l.length() == 0 || l.charAt(0) == '#') continue;
l = l.trim().toLowerCase();
if (l.length() < 4) continue;
this.dict.add(l);
this.tcid.add(reverse(l));
sb = new StringBuilder(l);
this.dict.add(sb);
this.tcid.add(reverse(sb));
}
} catch (final IOException e) {
// finish
}
}
/**
 * Creates a new builder that holds the characters of the input in reverse order.
 * The input is not modified.
 *
 * @param s the character sequence to reverse
 * @return a new builder with the reversed content
 */
private static String reverse(final String s) {
private static StringBuilder reverse(final StringBuilder s) {
// pre-size the target, the result has exactly the length of the input
final StringBuilder sb = new StringBuilder(s.length());
for (int i = s.length() - 1; i >= 0; i--) {
sb.append(s.charAt(i));
}
return sb.toString();
return sb;
}
/**
@ -132,25 +134,24 @@ public class WordCache {
* @param s input value that is used to match recommendations
* @return set that contains all words that start or end with the input value
*/
public Set<String> recommend(final String s) {
final Set<String> ret = new HashSet<String>();
String string = s.trim().toLowerCase();
SortedSet<String> t = this.dict.tailSet(string);
for (final String r: t) {
if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break;
public Set<StringBuilder> recommend(StringBuilder string) {
// NOTE(review): StringBuilder does not override equals/hashCode, so this HashSet
// distinguishes elements by identity; equal words found in different sources are
// presumably no longer merged as they were with String -- confirm that callers cope with that.
// NOTE(review): the StringBuilder variant no longer trims or lower-cases the input.
final Set<StringBuilder> ret = new HashSet<StringBuilder>();
// 1st source: dictionary words that start with the input and are longer than it;
// the scan stops at the first entry that does not qualify
SortedSet<StringBuilder> t = this.dict.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break;
}
// 2nd source: the shared commonWords cache, scanned the same way
final SortedMap<String, AtomicInteger> u = commonWords.tailMap(string);
String vv;
final SortedMap<StringBuilder, AtomicInteger> u = commonWords.tailMap(string);
StringBuilder vv;
try {
for (final Map.Entry<String, AtomicInteger> v: u.entrySet()) {
for (final Map.Entry<StringBuilder, AtomicInteger> v: u.entrySet()) {
vv = v.getKey();
if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break;
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) ret.add(vv); else break;
}
} catch (final ConcurrentModificationException e) {} // ignored: the scan of the cache simply ends early
// 3rd source: the reversed dictionary, searched with the reversed input to find words
// that end with the input; hits are reversed back before they are added
string = reverse(string);
t = this.tcid.tailSet(string);
for (final String r: t) {
if (r.startsWith(string) && r.length() > string.length()) ret.add(reverse(r)); else break;
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break;
}
return ret;
}
@ -160,8 +161,8 @@ public class WordCache {
* @param s the given word
* @return true if the library contains the word
*/
public boolean contains(final String s) {
return this.dict.contains(s.trim().toLowerCase());
public boolean contains(final StringBuilder s) {
// NOTE(review): the String variant trimmed and lower-cased the input; the StringBuilder
// variant passes it on unchanged, so surrounding whitespace is no longer removed here.
return this.dict.contains(s);
// if the above case is true then it is also true for this.tcid and vice versa
// that means it does not need to be tested as well
}
@ -173,16 +174,15 @@ public class WordCache {
* @param s the given word
* @return true if the library supports the word
*/
public boolean supports(final String s) {
String string = s.trim().toLowerCase();
SortedSet<String> t = this.dict.tailSet(string);
for (final String r: t) {
if (string.startsWith(r)) return true; else break;
public boolean supports(StringBuilder string) {
SortedSet<StringBuilder> t = this.dict.tailSet(string);
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break;
}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final String r: t) {
if (string.startsWith(r)) return true; else break;
for (final StringBuilder r: t) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break;
}
return false;
}

@ -37,7 +37,7 @@ import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.order.Base64Order;
public class WordTokenizer implements Enumeration<String> {
public class WordTokenizer implements Enumeration<StringBuilder> {
// this enumeration removes all words that contain either wrong characters or are too short
private StringBuilder buffer = null;
@ -72,8 +72,8 @@ public class WordTokenizer implements Enumeration<String> {
return this.buffer != null;
}
public String nextElement() {
final String r = (this.buffer == null) ? null : this.buffer.toString();
public StringBuilder nextElement() {
final StringBuilder r = (this.buffer == null) ? null : this.buffer;
this.buffer = nextElement0();
// put word to words statistics cache
if (this.meaningLib != null) WordCache.learn(r);
@ -172,14 +172,14 @@ public class WordTokenizer implements Enumeration<String> {
*/
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final Enumeration<String> words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
final Enumeration<StringBuilder> words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
int pos = 0;
String word;
StringBuilder word;
byte[] hash;
Integer oldpos;
while (words.hasMoreElements()) {
word = words.nextElement();
hash = Word.word2hash(word.toString());
hash = Word.word2hash(word);
// don't overwrite old values, that leads to too far word distances
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));

@ -2,19 +2,19 @@
* GeonamesLocalization.java
* Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 16.05.2010 on http://yacy.net
*
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -27,12 +27,10 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
@ -41,6 +39,7 @@ import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import net.yacy.document.StringBuilderComparator;
import net.yacy.kelondro.logging.Log;
public class GeonamesLocalization implements Localization {
@ -59,42 +58,35 @@ public class GeonamesLocalization implements Localization {
country code : ISO-3166 2-letter country code, 2 characters
cc2 : alternate country codes, comma separated, ISO-3166 2-letter country code, 60 characters
admin1 code : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
admin3 code : code for third level administrative division, varchar(20)
admin4 code : code for fourth level administrative division, varchar(20)
population : bigint (8 byte int)
population : bigint (8 byte int)
elevation : in meters, integer
gtopo30 : average elevation of 30'x30' (ca 900mx900m) area in meters, integer
timezone : the timezone id (see file timeZone.txt)
modification date : date of last modification in yyyy-MM-dd format
*/
// use a collator to relax when distinguishing between lowercase und uppercase letters
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
insensitiveCollator.setStrength(Collator.SECONDARY);
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
private final Map<Integer, Location> id2loc;
private final TreeMap<String, List<Integer>> name2ids;
private final TreeMap<StringBuilder, List<Integer>> name2ids;
private final File file;
public GeonamesLocalization(final File file) {
// this is a processing of the cities1000.zip file from http://download.geonames.org/export/dump/
this.file = file;
this.id2loc = new HashMap<Integer, Location>();
this.name2ids = new TreeMap<String, List<Integer>>(insensitiveCollator);
this.name2ids = new TreeMap<StringBuilder, List<Integer>>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
if (file == null || !file.exists()) return;
BufferedReader reader;
try {
ZipFile zf = new ZipFile(file);
ZipEntry ze = zf.getEntry("cities1000.txt");
InputStream is = zf.getInputStream(ze);
final ZipFile zf = new ZipFile(file);
final ZipEntry ze = zf.getEntry("cities1000.txt");
final InputStream is = zf.getInputStream(ze);
reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return;
}
@ -103,61 +95,71 @@ public class GeonamesLocalization implements Localization {
try {
String line;
String[] fields;
Set<String> locnames;
Set<StringBuilder> locnames;
while ((line = reader.readLine()) != null) {
if (line.length() == 0) continue;
fields = line.split("\t");
int id = Integer.parseInt(fields[0]);
locnames = new HashSet<String>();
locnames.add(fields[1]);
locnames.add(fields[2]);
for (String s: fields[3].split(",")) locnames.add(s);
Location c = new Location(Float.parseFloat(fields[5]), Float.parseFloat(fields[4]), fields[1]);
final int id = Integer.parseInt(fields[0]);
locnames = new HashSet<StringBuilder>();
locnames.add(new StringBuilder(fields[1]));
locnames.add(new StringBuilder(fields[2]));
for (final String s: fields[3].split(",")) locnames.add(new StringBuilder(s));
final Location c = new Location(Float.parseFloat(fields[5]), Float.parseFloat(fields[4]), fields[1]);
c.setPopulation((int) Long.parseLong(fields[14]));
this.id2loc.put(id, c);
for (String name: locnames) {
for (final StringBuilder name: locnames) {
List<Integer> locs = this.name2ids.get(name);
if (locs == null) locs = new ArrayList<Integer>(1);
locs.add(id);
this.name2ids.put(name, locs);
}
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
public int locations() {
return id2loc.size();
return this.id2loc.size();
}
public TreeSet<Location> find(String anyname, boolean locationexact) {
Set<Integer> r = new HashSet<Integer>();
public TreeSet<Location> find(final String anyname, final boolean locationexact) {
final Set<Integer> r = new HashSet<Integer>();
List<Integer> c;
final StringBuilder an = new StringBuilder(anyname);
if (locationexact) {
// name2ids is keyed by StringBuilder and ordered by a StringBuilder comparator;
// looking it up with the raw String would fail with a ClassCastException, so use the wrapped name
c = this.name2ids.get(an); if (c != null) r.addAll(c);
} else {
SortedMap<String, List<Integer>> cities = this.name2ids.tailMap(anyname);
for (Map.Entry<String, List<Integer>> e: cities.entrySet()) {
if (e.getKey().toLowerCase().startsWith(anyname.toLowerCase())) r.addAll(e.getValue()); else break;
final SortedMap<StringBuilder, List<Integer>> cities = this.name2ids.tailMap(an);
for (final Map.Entry<StringBuilder, List<Integer>> e: cities.entrySet()) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(e.getKey(), an)) r.addAll(e.getValue()); else break;
}
}
TreeSet<Location> a = new TreeSet<Location>();
for (Integer e: r) {
Location w = this.id2loc.get(e);
final TreeSet<Location> a = new TreeSet<Location>();
for (final Integer e: r) {
final Location w = this.id2loc.get(e);
if (w != null) a.add(w);
}
return a;
}
public Set<String> recommend(String s) {
Set<String> a = new HashSet<String>();
s = s.trim().toLowerCase();
public Set<String> recommend(final String s) {
final Set<String> a = new HashSet<String>();
final StringBuilder an = new StringBuilder(s);
if (s.length() == 0) return a;
SortedMap<String, List<Integer>> t = this.name2ids.tailMap(s);
for (String r: t.keySet()) {
r = r.toLowerCase();
if (r.startsWith(s)) a.add(r); else break;
final SortedMap<StringBuilder, List<Integer>> t = this.name2ids.tailMap(an);
for (final StringBuilder r: t.keySet()) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, an)) a.add(r.toString()); else break;
}
return a;
}
/**
 * Collect the location names that start with the given prefix.
 * The scan begins at the first key of name2ids that is not ordered before the prefix
 * and stops at the first key that StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith
 * rejects, relying on the map order to keep all matches adjacent.
 * The returned objects are the key instances stored in name2ids, not copies.
 * @param s the prefix to match; an empty prefix yields an empty result
 * @return the set of matching names
 */
public Set<StringBuilder> recommend(final StringBuilder s) {
    final Set<StringBuilder> matches = new HashSet<StringBuilder>();
    if (s.length() == 0) {
        return matches;
    }
    for (final StringBuilder candidate: this.name2ids.tailMap(s).keySet()) {
        if (!StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(candidate, s)) {
            break;
        }
        matches.add(candidate);
    }
    return matches;
}
@ -165,13 +167,13 @@ public class GeonamesLocalization implements Localization {
public String nickname() {
return this.file.getName();
}
public int hashCode() {
return this.nickname().hashCode();
return nickname().hashCode();
}
public boolean equals(Object other) {
public boolean equals(final Object other) {
if (!(other instanceof Localization)) return false;
return this.nickname().equals(((Localization) other).nickname());
return nickname().equals(((Localization) other).nickname());
}
}

@ -2,19 +2,19 @@
* Localization.java
* Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 16.05.2010 on http://yacy.net
*
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -32,13 +32,13 @@ import java.util.TreeSet;
*
*/
public interface Localization {
/**
* the number of locations that this localization stores
* @return the number of locations
*/
public int locations();
/**
* find a location by name
* @param anyname - a name of a location
@ -53,19 +53,25 @@ public interface Localization {
* @return a set of names that match with the given name using the local dictionary of names
*/
public Set<String> recommend(String s);
/**
* recommend a set of names according to a given name
* @param s a possibly partially matching name
* @return a set of names that match with the given name using the local dictionary of names
*/
public Set<StringBuilder> recommend(StringBuilder s);
/**
* return a nickname of the localization service
* @return the nickname
*/
public String nickname();
/**
* hashCode that must be used to distinguish localization services in hash sets
* @return the hash code, may be derived from the nickname
*/
public int hashCode();
/**
* compare localization services; to be used for hash sets with localization services
* @param other

@ -2,19 +2,19 @@
* OpenGeoDBLocalization
* Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 04.10.2009 on http://yacy.net
*
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -28,12 +28,10 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
@ -41,6 +39,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import net.yacy.document.StringBuilderComparator;
import net.yacy.kelondro.logging.Log;
@ -49,42 +48,35 @@ import net.yacy.kelondro.logging.Log;
* files can be loaded from http://sourceforge.net/projects/opengeodb/files/
* this class is used by the LibraryProvider, which expects input files inside
* DATA\DICTIONARIES\source
*
*
* ATTENTION:
* if this class is used, expect an extra memory usage of more than 100MB!
*
*
* This class will provide a super-fast access to the OpenGeoDB,
* since all request are evaluated using data in the RAM.
*/
public class OpenGeoDBLocalization implements Localization {
// use a collator to relax when distinguishing between lowercase und uppercase letters
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
insensitiveCollator.setStrength(Collator.SECONDARY);
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
private final Map<Integer, String> locTypeHash2locType;
private final Map<Integer, Location> id2loc;
private final Map<Integer, Integer> id2locTypeHash;
private final TreeMap<String, List<Integer>> name2ids;
private final Map<String, List<Integer>> kfz2ids;
private final TreeMap<StringBuilder, List<Integer>> name2ids;
private final TreeMap<StringBuilder, List<Integer>> kfz2ids;
private final Map<String, List<Integer>> predial2ids;
private final Map<String, Integer> zip2id;
private final File file;
public OpenGeoDBLocalization(final File file, boolean lonlat) {
public OpenGeoDBLocalization(final File file, final boolean lonlat) {
this.file = file;
this.locTypeHash2locType = new HashMap<Integer, String>();
this.id2loc = new HashMap<Integer, Location>();
this.id2locTypeHash = new HashMap<Integer, Integer>();
this.name2ids = new TreeMap<String, List<Integer>>(insensitiveCollator);
this.kfz2ids = new TreeMap<String, List<Integer>>(insensitiveCollator);
this.name2ids = new TreeMap<StringBuilder, List<Integer>>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
this.kfz2ids = new TreeMap<StringBuilder, List<Integer>>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
this.predial2ids = new HashMap<String, List<Integer>>();
this.zip2id = new HashMap<String, Integer>();
if (file == null || !file.exists()) return;
BufferedReader reader = null;
try {
@ -92,7 +84,7 @@ public class OpenGeoDBLocalization implements Localization {
if (file.getName().endsWith(".gz")) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String line;
// read lines
String[] v;
Integer id;
@ -114,7 +106,7 @@ public class OpenGeoDBLocalization implements Localization {
lat = Float.parseFloat(v[2]);
lon = Float.parseFloat(v[3]);
}
id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat));
this.id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat));
}
if (line.startsWith("geodb_textdata ")) {
line = line.substring(15 + 7);
@ -125,8 +117,8 @@ public class OpenGeoDBLocalization implements Localization {
// name2ids is keyed by StringBuilder; a lookup with the plain String h cannot be
// compared by the StringBuilder comparator, so wrap it the same way the put does
List<Integer> l = this.name2ids.get(new StringBuilder(h));
if (l == null) l = new ArrayList<Integer>(1);
l.add(id);
this.name2ids.put(h, l);
Location loc = this.id2loc.get(id);
this.name2ids.put(new StringBuilder(h), l);
final Location loc = this.id2loc.get(id);
if (loc != null) loc.setName(h);
} else if (v[1].equals("500400000")) { // Vorwahl
id = Integer.parseInt(v[0]);
@ -138,8 +130,8 @@ public class OpenGeoDBLocalization implements Localization {
} else if (v[1].equals("400300000")) { // Ortstyp
id = Integer.parseInt(v[0]);
h = removeQuotes(v[2]);
Integer hc = h.hashCode();
String t = this.locTypeHash2locType.get(hc);
final Integer hc = h.hashCode();
final String t = this.locTypeHash2locType.get(hc);
if (t == null) this.locTypeHash2locType.put(hc, h);
this.id2locTypeHash.put(id, hc);
} else if (v[1].equals("500300000")) { // PLZ
@ -150,7 +142,7 @@ public class OpenGeoDBLocalization implements Localization {
// kfz2ids is keyed by StringBuilder; a lookup with the plain String h cannot be
// compared by the StringBuilder comparator, so wrap it the same way the put does
List<Integer> l = this.kfz2ids.get(new StringBuilder(h));
if (l == null) l = new ArrayList<Integer>(1);
l.add(id);
this.kfz2ids.put(h, l);
this.kfz2ids.put(new StringBuilder(h), l);
}
}
continue;
@ -162,7 +154,7 @@ public class OpenGeoDBLocalization implements Localization {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
}
private static final String removeQuotes(String s) {
if (s.length() > 0 && s.charAt(0) != '\'') return s;
if (s.charAt(s.length() - 1) != '\'') return s;
@ -171,9 +163,9 @@ public class OpenGeoDBLocalization implements Localization {
}
public int locations() {
return id2loc.size();
return this.id2loc.size();
}
/**
* check database tables against occurrences of this entity
* the anyname - String may be one of:
@ -184,55 +176,65 @@ public class OpenGeoDBLocalization implements Localization {
* @param anyname
* @return
*/
public TreeSet<Location> find(String anyname, boolean locationexact) {
HashSet<Integer> r = new HashSet<Integer>();
public TreeSet<Location> find(final String anyname, final boolean locationexact) {
final HashSet<Integer> r = new HashSet<Integer>();
List<Integer> c;
final StringBuilder an = new StringBuilder(anyname);
if (locationexact) {
c = this.name2ids.get(anyname); if (c != null) r.addAll(c);
c = this.name2ids.get(an); if (c != null) r.addAll(c);
} else {
SortedMap<String, List<Integer>> cities = this.name2ids.tailMap(anyname);
for (Map.Entry<String, List<Integer>> e: cities.entrySet()) {
if (e.getKey().toLowerCase().startsWith(anyname.toLowerCase())) r.addAll(e.getValue()); else break;
final SortedMap<StringBuilder, List<Integer>> cities = this.name2ids.tailMap(an);
for (final Map.Entry<StringBuilder, List<Integer>> e: cities.entrySet()) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(e.getKey(), an)) r.addAll(e.getValue()); else break;
}
c = this.kfz2ids.get(anyname); if (c != null) r.addAll(c);
c = this.kfz2ids.get(an); if (c != null) r.addAll(c);
c = this.predial2ids.get(anyname); if (c != null) r.addAll(c);
Integer i = this.zip2id.get(anyname); if (i != null) r.add(i);
final Integer i = this.zip2id.get(anyname); if (i != null) r.add(i);
}
TreeSet<Location> a = new TreeSet<Location>();
for (Integer e: r) {
Location w = this.id2loc.get(e);
final TreeSet<Location> a = new TreeSet<Location>();
for (final Integer e: r) {
final Location w = this.id2loc.get(e);
if (w != null) a.add(w);
}
return a;
}
/**
* read the dictionary and construct a set of recommendations to a given string
* read the dictionary and construct a set of recommendations to a given string
* @param s input value that is used to match recommendations
* @return a set that contains all words that start with the input value
*/
public Set<String> recommend(String s) {
Set<String> a = new HashSet<String>();
s = s.trim().toLowerCase();
public Set<String> recommend(final String s) {
final Set<String> a = new HashSet<String>();
final StringBuilder an = new StringBuilder(s);
if (s.length() == 0) return a;
final SortedMap<StringBuilder, List<Integer>> t = this.name2ids.tailMap(an);
for (final StringBuilder r: t.keySet()) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, an)) a.add(r.toString()); else break;
}
return a;
}
public Set<StringBuilder> recommend(final StringBuilder s) {
final Set<StringBuilder> a = new HashSet<StringBuilder>();
if (s.length() == 0) return a;
SortedMap<String, List<Integer>> t = this.name2ids.tailMap(s);
for (String r: t.keySet()) {
r = r.toLowerCase();
if (r.startsWith(s)) a.add(r); else break;
final SortedMap<StringBuilder, List<Integer>> t = this.name2ids.tailMap(s);
for (final StringBuilder r: t.keySet()) {
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, s)) a.add(r); else break;
}
return a;
}
public String nickname() {
return this.file.getName();
}
public int hashCode() {
return this.nickname().hashCode();
return nickname().hashCode();
}
public boolean equals(Object other) {
public boolean equals(final Object other) {
if (!(other instanceof Localization)) return false;
return this.nickname().equals(((Localization) other).nickname());
return nickname().equals(((Localization) other).nickname());
}
}

@ -2,19 +2,19 @@
* OverarchingLocalization.java
* Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 16.05.2010 on http://yacy.net
*
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -30,46 +30,46 @@ import java.util.TreeSet;
public class OverarchingLocalization implements Localization {
private Map<String, Localization> services;
private final Map<String, Localization> services;
/**
* create a new overarching localization object
*/
public OverarchingLocalization() {
this.services = new HashMap<String, Localization>();
}
/**
* add a localization service
* @param nickname the nickname of the service
* @param service the service
*/
public void addLocalization(String nickname, Localization service) {
public void addLocalization(final String nickname, final Localization service) {
this.services.put(nickname, service);
}
/**
* remove a localization service
* @param nickname
*/
public void removeLocalization(String nickname) {
public void removeLocalization(final String nickname) {
this.services.remove(nickname);
}
public int locations() {
int locations = 0;
for (Localization service: this.services.values()) {
for (final Localization service: this.services.values()) {
locations += service.locations();
}
return locations;
}
/**
* find (a set of) locations
*/
public TreeSet<Location> find(String anyname, boolean locationexact) {
TreeSet<Location> locations = new TreeSet<Location>();
for (Localization service: this.services.values()) {
public TreeSet<Location> find(final String anyname, final boolean locationexact) {
final TreeSet<Location> locations = new TreeSet<Location>();
for (final Localization service: this.services.values()) {
locations.addAll(service.find(anyname, locationexact));
}
return locations;
@ -78,10 +78,19 @@ public class OverarchingLocalization implements Localization {
/**
* recommend location names
*/
public Set<String> recommend(String s) {
Set<String> recommendations = new HashSet<String>();
public Set<String> recommend(final String s) {
final Set<String> recommendations = new HashSet<String>();
if (s.length() == 0) return recommendations;
for (Localization service: this.services.values()) {
for (final Localization service: this.services.values()) {
recommendations.addAll(service.recommend(s));
}
return recommendations;
}
public Set<StringBuilder> recommend(final StringBuilder s) {
final Set<StringBuilder> recommendations = new HashSet<StringBuilder>();
if (s.length() == 0) return recommendations;
for (final Localization service: this.services.values()) {
recommendations.addAll(service.recommend(s));
}
return recommendations;
@ -90,14 +99,14 @@ public class OverarchingLocalization implements Localization {
public String nickname() {
return "oa";
}
public int hashCode() {
return this.nickname().hashCode();
return nickname().hashCode();
}
public boolean equals(Object other) {
public boolean equals(final Object other) {
if (!(other instanceof Localization)) return false;
return this.nickname().equals(((Localization) other).nickname());
return nickname().equals(((Localization) other).nickname());
}
}

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -30,6 +30,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.document.LargeNumberCache;
@ -59,7 +60,7 @@ public class Word {
static {
try {
hashCache = new ConcurrentARC<String, byte[]>(hashCacheSize, Math.max(32, 4 * Runtime.getRuntime().availableProcessors()));
} catch (OutOfMemoryError e) {
} catch (final OutOfMemoryError e) {
hashCache = new ConcurrentARC<String, byte[]>(1000, Math.max(8, 2 * Runtime.getRuntime().availableProcessors()));
}
}
@ -69,7 +70,7 @@ public class Word {
hashCache = new ConcurrentHashMap<String, byte[]>();
}
*/
// object carries statistics for words and sentences
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
@ -88,29 +89,28 @@ public class Word {
}
public void inc() {
count++;
this.count++;
}
public int occurrences() {
return count;
return this.count;
}
public void check(final int i) {
phrases.add(LargeNumberCache.valueOf(i));
this.phrases.add(LargeNumberCache.valueOf(i));
}
public Iterator<Integer> phrases() {
// returns an iterator to handles of all phrases where the word appears
return phrases.iterator();
return this.phrases.iterator();
}
@Override
public String toString() {
// this is here for debugging
return "{count=" + count + ", posInText=" + posInText + ", posInPhrase=" + posInPhrase + ", numOfPhrase=" + numOfPhrase + "}";
return "{count=" + this.count + ", posInText=" + this.posInText + ", posInPhrase=" + this.posInPhrase + ", numOfPhrase=" + this.numOfPhrase + "}";
}
// static methods
public static byte[] word2hash(final StringBuilder word) {
return word2hash(word.toString());
@ -118,7 +118,7 @@ public class Word {
// create a word hash
public static final byte[] word2hash(final String word) {
String wordlc = word.toLowerCase(Locale.ENGLISH);
final String wordlc = word.toLowerCase(Locale.ENGLISH);
byte[] h = hashCache.get(wordlc);
if (h != null) return h;
// calculate the hash
@ -132,25 +132,25 @@ public class Word {
}
return h;
}
public static final HandleSet words2hashesHandles(final Set<String> words) {
final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size());
for (final String word: words)
try {
hashes.put(word2hash(word));
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
return hashes;
}
return hashes;
}
public static final HandleSet words2hashesHandles(final String[] words) {
final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.length);
for (final String word: words)
try {
hashes.put(word2hash(word));
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
return hashes;
}

@ -59,7 +59,11 @@ public class AccessTracker {
private static void add(final LinkedList<QueryParams> list, final QueryParams query) {
// learn that this word can be a word completion for the DidYouMeanLibrary
if (query.resultcount > 10 && query.queryString != null && query.queryString.length() > 0) WordCache.learn(query.queryString);
// the StringBuilder constructor already copies the query string; appending it again
// would teach the word cache the query text twice in a row (e.g. "foofoo" instead of "foo")
if (query.resultcount > 10 && query.queryString != null && query.queryString.length() > 0) {
    WordCache.learn(new StringBuilder(query.queryString));
}
// add query to statistics list
list.add(query);

Loading…
Cancel
Save