diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index aa66fb061..3142d35fd 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -282,13 +282,14 @@ public class ViewFile { boolean dark = true; int i = 0; - String sentence, token; + String sentence; + StringBuilder token; if (sentences != null) { // Search word highlighting for (final StringBuilder s: sentences) { sentence = s.toString(); - Enumeration tokens = null; + Enumeration tokens = null; tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib); while (tokens.hasMoreElements()) { token = tokens.nextElement(); diff --git a/htroot/suggest.java b/htroot/suggest.java index 26d04d219..7ac179525 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -31,7 +31,6 @@ import net.yacy.kelondro.data.word.Word; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.search.index.Segments; - import de.anomic.data.DidYouMean; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -43,7 +42,7 @@ import de.anomic.server.servletProperties; * http://www.opensearch.org/Specifications/OpenSearch/Extensions/Suggestions/1.1 * or * https://wiki.mozilla.org/Search_Service/Suggestions - * + * * for xml format: * see Microsoft Search Suggestion Format * http://msdn.microsoft.com/en-us/library/cc848863%28VS.85%29.aspx @@ -51,9 +50,9 @@ import de.anomic.server.servletProperties; * http://msdn.microsoft.com/en-us/library/cc848862%28v=VS.85%29.aspx */ public class suggest { - + private static final int meanMax = 30; - + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; final servletProperties prop = new servletProperties(); @@ -62,17 +61,17 @@ public class suggest { final boolean json = ext.equals("json"); final boolean xml = ext.equals("xml"); final boolean more = post != null && post.containsKey("more"); 
- + // get query final String originalquerystring = (post == null) ? "" : post.get("query", post.get("q", "")).trim(); final String querystring = originalquerystring.replace('+', ' '); final int timeout = (post == null) ? 300 : post.getInt("timeout", 300); final int count = (post == null) ? 20 : post.getInt("count", 20); - + // get segment final Segment indexSegment; if (post != null && post.containsKey("segment")) { - String segmentName = post.get("segment"); + final String segmentName = post.get("segment"); if (sb.indexSegments.segmentExist(segmentName)) { indexSegment = sb.indexSegments.segment(segmentName); } else { @@ -83,18 +82,18 @@ public class suggest { // take default segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); } - + int c = 0; if (more || (indexSegment != null && !indexSegment.termIndex().has(Word.word2hash(querystring)))) { - final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring); - final Iterator meanIt = didYouMean.getSuggestions(timeout, count).iterator(); + final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring)); + final Iterator meanIt = didYouMean.getSuggestions(timeout, count).iterator(); String suggestion; //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]] while (c < meanMax && meanIt.hasNext()) { - suggestion = meanIt.next(); + suggestion = meanIt.next().toString(); if (json) { prop.putJSON("suggestions_" + c + "_text", suggestion); } else if (xml) { @@ -106,7 +105,7 @@ public class suggest { c++; } } - + if (c > 0) { prop.put("suggestions_" + (c - 1) + "_eol", 1); } @@ -125,9 +124,9 @@ public class suggest { outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); prop.setOutgoingHeader(outgoingHeader); } - + // return rewrite properties return prop; } - + } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index f771145fb..2d884f5db 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ 
-656,12 +656,12 @@ public class yacysearch { prop.put("meanCount", meanMax); if (meanMax > 0 && !json && !rss) { - final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring); - final Iterator meanIt = didYouMean.getSuggestions(100, 5).iterator(); + final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring)); + final Iterator meanIt = didYouMean.getSuggestions(100, 5).iterator(); int meanCount = 0; String suggestion; while( meanCount *
  • Changing one letter: bat / cat;
  • @@ -35,7 +36,7 @@ public class DidYouMean { private static final int MinimumInputWordLength = 2; private static final int MinimumOutputWordLength = 4; - + private static final char[] ALPHABET_LATIN = { 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', 'q','r','s','t','u','v','w','x','y','z', @@ -52,40 +53,40 @@ public class DidYouMean { } private static final char[][] ALPHABETS = {ALPHABET_LATIN, ALPHABET_KANJI}; private static char[] alphabet = ALPHABET_LATIN; - - private static final String POISON_STRING = "\n"; + + private static final StringBuilder POISON_STRING = new StringBuilder("\n"); public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors(); private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator(); - + private final IndexCell index; - private final String word; + private final StringBuilder word; private final int wordLen; - private final LinkedBlockingQueue guessGen, guessLib; + private final LinkedBlockingQueue guessGen, guessLib; private long timeLimit; private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written - private final SortedSet resultSet; + private final SortedSet resultSet; private final indexSizeComparator INDEX_SIZE_COMPARATOR; - - + + /** * @param index a termIndex - most likely retrieved from a switchboard object. * @param sort true/false - sorts the resulting TreeSet by index.count(); Warning: this causes heavy i/o. 
*/ - public DidYouMean(final IndexCell index, final String word0) { - this.resultSet = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR))); - this.word = word0.toLowerCase(); - this.wordLen = word.length(); + public DidYouMean(final IndexCell index, final StringBuilder word0) { + this.resultSet = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR))); + this.word = word0; + this.wordLen = this.word.length(); this.index = index; - this.guessGen = new LinkedBlockingQueue(); - this.guessLib = new LinkedBlockingQueue(); + this.guessGen = new LinkedBlockingQueue(); + this.guessLib = new LinkedBlockingQueue(); this.createGen = true; this.INDEX_SIZE_COMPARATOR = new indexSizeComparator(); - + // identify language if (this.word.length() > 0) { - char testchar = this.word.charAt(0); + final char testchar = this.word.charAt(0); boolean alphafound = false; - alphatest: for (char[] alpha: ALPHABETS) { + alphatest: for (final char[] alpha: ALPHABETS) { if (isAlphabet(alpha, testchar)) { alphabet = alpha; alphafound = true; @@ -94,8 +95,8 @@ public class DidYouMean { } if (!alphafound) { // generate generic alphabet using simply a character block of 256 characters - char firstchar = (char) ((0xff & (testchar / 256)) * 256); - char lastchar = (char) (firstchar + 255); + final char firstchar = (char) ((0xff & (testchar / 256)) * 256); + final char lastchar = (char) (firstchar + 255); alphabet = new char[256]; for (char a = firstchar; a <= lastchar; a++) { alphabet[0xff & (a - firstchar)] = a; @@ -103,18 +104,18 @@ public class DidYouMean { } } } - + private static final boolean isAlphabet(final char[] alpha, final char testchar) { for (final char a: alpha) if (a == testchar) return true; return false; } - + public void reset() { this.resultSet.clear(); this.guessGen.clear(); this.guessLib.clear(); } - + /** * get suggestions for a given word. 
The result is first ordered using a term size ordering, * and a subset of the result is sorted again with a IO-intensive order based on the index size @@ -123,29 +124,29 @@ public class DidYouMean { * @param preSortSelection the number of words that participate in the IO-intensive sort * @return */ - public SortedSet getSuggestions(final long timeout, final int preSortSelection) { + public SortedSet getSuggestions(final long timeout, final int preSortSelection) { if (this.word.length() < MinimumInputWordLength) return this.resultSet; // return nothing if input is too short final long startTime = System.currentTimeMillis(); final long timelimit = startTime + timeout; - if (this.word.indexOf(' ') > 0) return getSuggestions(this.word.split(" "), timeout, preSortSelection, this.index); - final SortedSet preSorted = getSuggestions(timeout); + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.indexOf(this.word, ' ') > 0) return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.index); + final SortedSet preSorted = getSuggestions(timeout); if (System.currentTimeMillis() > timelimit) { Log.logInfo("DidYouMean", "found and returned " + preSorted.size() + " unsorted suggestions (1); execution time: " + (System.currentTimeMillis() - startTime) + "ms"); return preSorted; } - - final ReversibleScoreMap scored = new ClusteredScoreMap(); - for (final String s: preSorted) { + + final ReversibleScoreMap scored = new ClusteredScoreMap(StringBuilderComparator.CASE_INSENSITIVE_ORDER); + for (final StringBuilder s: preSorted) { if (System.currentTimeMillis() > timelimit) break; if (!(scored.sizeSmaller(2 * preSortSelection))) break; - scored.inc(s, index.count(Word.word2hash(s))); + scored.inc(s, this.index.count(Word.word2hash(s))); } - final SortedSet countSorted = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR))); - final int wc = 
index.count(Word.word2hash(this.word)); // all counts must be greater than this + final SortedSet countSorted = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR))); + final int wc = this.index.count(Word.word2hash(this.word)); // all counts must be greater than this while (!scored.isEmpty() && countSorted.size() < preSortSelection) { - final String s = scored.getMaxKey(); - int score = scored.delete(s); + final StringBuilder s = scored.getMaxKey(); + final int score = scored.delete(s); if (s.length() >= MinimumOutputWordLength && score > wc) countSorted.add(s); if (System.currentTimeMillis() > timelimit) break; } @@ -161,7 +162,7 @@ public class DidYouMean { return countSorted; } - + /** * return a string that is a suggestion list for the list of given words * @param words @@ -170,13 +171,13 @@ public class DidYouMean { * @return */ @SuppressWarnings("unchecked") - private static SortedSet getSuggestions(final String[] words, final long timeout, final int preSortSelection, final IndexCell index) { - final SortedSet[] s = new SortedSet[words.length]; + private static SortedSet getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final IndexCell index) { + final SortedSet[] s = new SortedSet[words.length]; for (int i = 0; i < words.length; i++) { s[i] = new DidYouMean(index, words[i]).getSuggestions(timeout / words.length, preSortSelection); } // make all permutations - final SortedSet result = new TreeSet(); + final SortedSet result = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); StringBuilder sb; for (int i = 0; i < words.length; i++) { if (s[i].isEmpty()) continue; @@ -185,21 +186,21 @@ public class DidYouMean { if (j > 0) sb.append(' '); if (i == j) sb.append(s[j].first()); else sb.append(words[j]); } - result.add(sb.toString()); + result.add(sb); } return result; } - + /** * This method triggers the producer and consumer threads of the DidYouMean 
object. * @param word a String with a single word * @param timeout execution time in ms. * @return a Set<String> with word variations contained in term index. */ - private SortedSet getSuggestions(final long timeout) { - long startTime = System.currentTimeMillis(); + private SortedSet getSuggestions(final long timeout) { + final long startTime = System.currentTimeMillis(); this.timeLimit = startTime + timeout; - + // create one consumer thread that checks the guessLib queue // for occurrences in the index. If the producers are started next, their // results can be consumers directly @@ -208,14 +209,14 @@ public class DidYouMean { consumers[0].start(); // get a single recommendation for the word without altering the word - Set libr = LibraryProvider.dymLib.recommend(this.word); - for (final String t: libr) { + final Set libr = LibraryProvider.dymLib.recommend(this.word); + for (final StringBuilder t: libr) { if (!t.equals(this.word)) try { - createGen = false; - guessLib.put(t); - } catch (InterruptedException e) {} + this.createGen = false; + this.guessLib.put(t); + } catch (final InterruptedException e) {} } - + // create and start producers // the CPU load to create the guessed words is very low, but the testing // against the library may be CPU intensive. 
Since it is possible to test @@ -226,50 +227,50 @@ public class DidYouMean { producers[2] = new DeletingOneLetter(); producers[3] = new ReversingTwoConsecutiveLetters(); for (final Thread t: producers) t.start(); - + // start more consumers if there are more cores if (consumers.length > 1) for (int i = 1; i < consumers.length; i++) { consumers[i] = new Consumer(); consumers[i].start(); } - + // now decide which kind of guess is better // we take guessLib entries as long as there is any entry in it // to see if this is the case, we must wait for termination of the producer - for (final Thread t: producers) try { t.join(); } catch (InterruptedException e) {} - + for (final Thread t: producers) try { t.join(); } catch (final InterruptedException e) {} + // if there is not any entry in guessLib, then transfer all entries from the // guessGen to guessLib - if (createGen) try { + if (this.createGen) try { this.guessGen.put(POISON_STRING); - String s; + StringBuilder s; while (!(s = this.guessGen.take()).equals(POISON_STRING)) this.guessLib.put(s); - } catch (InterruptedException e) {} - + } catch (final InterruptedException e) {} + // put poison into guessLib to terminate consumers for (@SuppressWarnings("unused") final Consumer c: consumers) - try { guessLib.put(POISON_STRING); } catch (InterruptedException e) {} - + try { this.guessLib.put(POISON_STRING); } catch (final InterruptedException e) {} + // wait for termination of consumer for (final Consumer c: consumers) - try { c.join(); } catch (InterruptedException e) {} - + try { c.join(); } catch (final InterruptedException e) {} + // we don't want the given word in the result this.resultSet.remove(this.word); return this.resultSet; - + } - - private void test(final String s) throws InterruptedException { - final Set libr = LibraryProvider.dymLib.recommend(s); + + private void test(final StringBuilder s) throws InterruptedException { + final Set libr = LibraryProvider.dymLib.recommend(s); 
libr.addAll(LibraryProvider.geoLoc.recommend(s)); - if (!libr.isEmpty()) createGen = false; - for (final String t: libr) { - guessLib.put(t); + if (!libr.isEmpty()) this.createGen = false; + for (final StringBuilder t: libr) { + this.guessLib.put(t); } - if (createGen) { - guessGen.put(s); + if (this.createGen) { + this.guessGen.put(s); } } @@ -279,72 +280,78 @@ public class DidYouMean { * Note: the loop runs (alphabet.length * len) tests. */ public class ChangingOneLetter extends Thread { - + @Override public void run() { char m; - for (int i = 0; i < wordLen; i++) try { - m = word.charAt(i); - for (char c: alphabet) { - if (m != c) test(word.substring(0, i) + c + word.substring(i + 1)); - if (System.currentTimeMillis() > timeLimit) return; + for (int i = 0; i < DidYouMean.this.wordLen; i++) try { + m = DidYouMean.this.word.charAt(i); + for (final char c: alphabet) { + if (m != c) { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i + 1)); + test(ts); + } + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; } - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } } - + /** * DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term * and puts it on the blocking queue, to be 'consumed' by a consumer thread.

    * Note: the loop runs (len) tests. */ private class DeletingOneLetter extends Thread { - + @Override public void run() { - for (int i = 0; i < wordLen; i++) try { - test(word.substring(0, i) + word.substring(i+1)); - if (System.currentTimeMillis() > timeLimit) return; - } catch (InterruptedException e) {} + for (int i = 0; i < DidYouMean.this.wordLen; i++) try { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.substring(i + 1)); + test(ts); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; + } catch (final InterruptedException e) {} } - + } - + /** * DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.

    * Note: the loop runs (alphabet.length * len) tests. */ private class AddingOneLetter extends Thread { - + @Override public void run() { - for (int i = 0; i <= wordLen; i++) try { + for (int i = 0; i <= DidYouMean.this.wordLen; i++) try { for (final char c: alphabet) { - test(word.substring(0, i) + c + word.substring(i)); - if (System.currentTimeMillis() > timeLimit) return; + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(c).append(DidYouMean.this.word.substring(i)); + test(ts); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; } - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } } - + /** * DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term * and puts it on the blocking queue, to be 'consumed' by a consumer thread.

    * Note: the loop runs (len-1) tests. */ private class ReversingTwoConsecutiveLetters extends Thread { - + @Override public void run() { - for (int i = 0; i < wordLen - 1; i++) try { - test(word.substring(0, i) + word.charAt(i + 1) + word.charAt(i) + word.substring(i +2)); - if (System.currentTimeMillis() > timeLimit) return; - } catch (InterruptedException e) {} + for (int i = 0; i < DidYouMean.this.wordLen - 1; i++) try { + final StringBuilder ts = new StringBuilder(DidYouMean.this.word.length() + 1).append(DidYouMean.this.word.substring(0, i)).append(DidYouMean.this.word.charAt(i + 1)).append(DidYouMean.this.word.charAt(i)).append(DidYouMean.this.word.substring(i + 2)); + test(ts); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; + } catch (final InterruptedException e) {} } - + } - + /** * DidYouMean's consumer thread takes a String object (term) from the blocking queue * and checks if it is contained in YaCy's RWI index. @@ -354,64 +361,64 @@ public class DidYouMean { @Override public void run() { - String s; + StringBuilder s; try { - while ((s = guessLib.take()) != POISON_STRING) { - if (s.length() >= MinimumOutputWordLength && index.has(Word.word2hash(s))) resultSet.add(s); - if (System.currentTimeMillis() > timeLimit) return; + while ((s = DidYouMean.this.guessLib.take()) != POISON_STRING) { + if (s.length() >= MinimumOutputWordLength && DidYouMean.this.index.has(Word.word2hash(s))) DidYouMean.this.resultSet.add(s); + if (System.currentTimeMillis() > DidYouMean.this.timeLimit) return; } - } catch (InterruptedException e) {} + } catch (final InterruptedException e) {} } } - + /** * indexSizeComparator is used by DidYouMean to order terms by index.count() * Warning: this causes heavy i/o */ - private class indexSizeComparator implements Comparator { + private class indexSizeComparator implements Comparator { - public int compare(final String o1, final String o2) { - final int i1 = index.count(Word.word2hash(o1)); - final int i2 = 
index.count(Word.word2hash(o2)); + public int compare(final StringBuilder o1, final StringBuilder o2) { + final int i1 = DidYouMean.this.index.count(Word.word2hash(o1)); + final int i2 = DidYouMean.this.index.count(Word.word2hash(o2)); if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2); return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result - } + } } - + /** * wordLengthComparator is used by DidYouMean to order terms by the term length * This is the default order if the indexSizeComparator is not used */ - private static class wordLengthComparator implements Comparator { + private static class wordLengthComparator implements Comparator { - public int compare(final String o1, final String o2) { + public int compare(final StringBuilder o1, final StringBuilder o2) { final int i1 = o1.length(); final int i2 = o2.length(); - if (i1 == i2) return o1.compareTo(o2); + if (i1 == i2) return StringBuilderComparator.CASE_INSENSITIVE_ORDER.compare(o1, o2); return (i1 < i2) ? 
1 : -1; // '<' is correct, because the longest word shall be first } - + } /** * headMatchingComparator is used to sort results in such a way that words that match with the given words are sorted first */ - private static class headMatchingComparator implements Comparator { - private final String head; - private final Comparator secondaryComparator; - public headMatchingComparator(final String head, final Comparator secondaryComparator) { - this.head = head.toLowerCase(); + private static class headMatchingComparator implements Comparator { + private final StringBuilder head; + private final Comparator secondaryComparator; + public headMatchingComparator(final StringBuilder head, final Comparator secondaryComparator) { + this.head = head; this.secondaryComparator = secondaryComparator; } - - public int compare(final String o1, final String o2) { - boolean o1m = o1.toLowerCase().startsWith(head); - boolean o2m = o2.toLowerCase().startsWith(head); - if ((o1m && o2m) || (!o1m && !o2m)) return secondaryComparator.compare(o1, o2); + + public int compare(final StringBuilder o1, final StringBuilder o2) { + final boolean o1m = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o1, this.head); + final boolean o2m = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o2, this.head); + if ((o1m && o2m) || (!o1m && !o2m)) return this.secondaryComparator.compare(o1, o2); return o1m ? 
-1 : 1; } } - + } diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java index 3feef7d9b..86addacc2 100644 --- a/source/de/anomic/data/ymark/YMarkAutoTagger.java +++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java @@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle final TreeSet topwords = new TreeSet(); // final TreeMap pairs = new TreeMap(); - String token; + StringBuilder token; // StringBuilder pair = new StringBuilder(64); if(document != null) { @@ -100,7 +100,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle buffer.append(document.dc_title()); buffer.append(document.dc_description()); buffer.append(document.dc_subject(' ')); - final Enumeration tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); + final Enumeration tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); int count = 0; @@ -133,7 +133,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle if (token.length()>3) { count = word.occurrences() * 100; } - topwords.add(new YMarkTag(token, count)); + topwords.add(new YMarkTag(token.toString(), count)); } } count = 0; diff --git a/source/de/anomic/server/serverObjects.java b/source/de/anomic/server/serverObjects.java index 08a295fdb..9cf1f1fd2 100644 --- a/source/de/anomic/server/serverObjects.java +++ b/source/de/anomic/server/serverObjects.java @@ -118,6 +118,27 @@ public class serverObjects extends HashMap implements Cloneable } } + /** + * Add a key-value pair of Objects to the map. + * @param key This method will do nothing if the key is null. + * @param value The value that should be mapped to the key. + * If value is null, then the element at key + * is removed from the map. + * @return The value that was added to the map. 
+ * @see java.util.Hashtable#insert(K, V) + */ + public void put(final String key, final StringBuilder value) { + if (key == null) { + // this does nothing + return; + } else if (value == null) { + // assigning the null value creates the same effect like removing the element + super.remove(key); + } else { + super.put(key, value.toString()); + } + } + /** * Add byte array to the map, value is kept as it is. * @param key key name as String. @@ -165,6 +186,10 @@ public class serverObjects extends HashMap implements Cloneable return put(key, toJSON(value)); } + public String putJSON(final String key, final StringBuilder value) { + return put(key, toJSON(value.toString())); + } + private static String toJSON(String value) { // value = value.replaceAll("\\", "\\\\"); value = patternDoublequote.matcher(value).replaceAll("'"); diff --git a/source/net/yacy/cora/document/UTF8.java b/source/net/yacy/cora/document/UTF8.java index 2546d787e..f7675528a 100644 --- a/source/net/yacy/cora/document/UTF8.java +++ b/source/net/yacy/cora/document/UTF8.java @@ -154,6 +154,11 @@ public class UTF8 { return s.getBytes(charset); } + public final static byte[] getBytes(final StringBuilder s) { + if (s == null) return null; + return s.toString().getBytes(charset); + } + /** * Decodes a application/x-www-form-urlencoded string using a specific * encoding scheme. 
@@ -179,15 +184,22 @@ public class UTF8 { int pos = 0; while (((i+2) < numChars) && (c=='%')) { final int v = Integer.parseInt(s.substring(i+1,i+3),16); - if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value"); + if (v < 0) { + return s; + //throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value"); + } bytes[pos++] = (byte) v; i+= 3; if (i < numChars) c = s.charAt(i); } - if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern"); + if ((i < numChars) && (c=='%')) { + return s; + //throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern"); + } sb.append(new String(bytes, 0, pos, charset)); } catch (final NumberFormatException e) { - throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage()); + return s; + //throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage()); } needToChange = true; break; diff --git a/source/net/yacy/cora/ranking/ClusteredScoreMap.java b/source/net/yacy/cora/ranking/ClusteredScoreMap.java index b827b8c7e..f8e5870e7 100644 --- a/source/net/yacy/cora/ranking/ClusteredScoreMap.java +++ b/source/net/yacy/cora/ranking/ClusteredScoreMap.java @@ -11,12 +11,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
- * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -26,6 +26,7 @@ package net.yacy.cora.ranking; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.util.Comparator; import java.util.Iterator; import java.util.Locale; import java.util.Map; @@ -37,77 +38,84 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.storage.OutOfLimitsException; public final class ClusteredScoreMap extends AbstractScoreMap implements ReversibleScoreMap { - + protected final Map map; // a mapping from a reference to the cluster key protected final TreeMap pam; // a mapping from the cluster key to the reference private long gcount; private int encnt; - + public ClusteredScoreMap() { - map = new TreeMap(); - pam = new TreeMap(); - gcount = 0; - encnt = 0; + this.map = new TreeMap(); + this.pam = new TreeMap(); + this.gcount = 0; + this.encnt = 0; + } + + public ClusteredScoreMap(final Comparator c) { + this.map = new TreeMap(c); + this.pam = new TreeMap(); + this.gcount = 0; + this.encnt = 0; } public Iterator iterator() { - return map.keySet().iterator(); + return this.map.keySet().iterator(); } - + public synchronized void clear() { - map.clear(); - pam.clear(); - gcount = 0; - encnt = 0; + this.map.clear(); + this.pam.clear(); + this.gcount = 0; + this.encnt = 0; } - + /** * shrink the cluster to a demanded size * @param maxsize */ - public void shrinkToMaxSize(int maxsize) { + public void shrinkToMaxSize(final int maxsize) { if (maxsize < 0) return; Long key; synchronized (this) { - while (map.size() > maxsize) { + while (this.map.size() > maxsize) { // find and remove smallest objects until cluster has demanded size - key = pam.firstKey(); + key = this.pam.firstKey(); if (key == null) break; - map.remove(pam.remove(key)); + this.map.remove(this.pam.remove(key)); } } } - + /** * shrink the cluster in such a way that the smallest score is equal or greater 
than a given minScore * @param minScore */ - public void shrinkToMinScore(int minScore) { + public void shrinkToMinScore(final int minScore) { int score; Long key; synchronized (this) { - while (pam.size() > 0) { + while (this.pam.size() > 0) { // find and remove objects where their score is smaller than the demanded minimum score - key = pam.firstKey(); + key = this.pam.firstKey(); if (key == null) break; score = (int) ((key.longValue() & 0xFFFFFFFF00000000L) >> 32); if (score >= minScore) break; - map.remove(pam.remove(key)); + this.map.remove(this.pam.remove(key)); } } } - + public static final String shortDateFormatString = "yyyyMMddHHmmss"; public static final SimpleDateFormat shortFormatter = new SimpleDateFormat(shortDateFormatString, Locale.US); public static final long minutemillis = 60000; public static long date2000 = 0; - + static { try { date2000 = shortFormatter.parse("20000101000000").getTime(); } catch (final ParseException e) {} } - + public static int object2score(Object o) { if (o instanceof Integer) return ((Integer) o).intValue(); if (o instanceof Long) { @@ -126,7 +134,7 @@ public final class ClusteredScoreMap extends AbstractScoreMap implements R String s = null; if (o instanceof String) s = (String) o; if (o instanceof byte[]) s = UTF8.String((byte[]) o); - + // this can be used to calculate a score from a string if (s == null || s.length() == 0 || s.charAt(0) == '-') return 0; try { @@ -163,7 +171,7 @@ public final class ClusteredScoreMap extends AbstractScoreMap implements R return c; } } - + private static final byte[] plainByteArray = new byte[256]; static { for (int i = 0; i < 32; i++) plainByteArray[i] = (byte) i; @@ -171,235 +179,235 @@ public final class ClusteredScoreMap extends AbstractScoreMap implements R for (int i = 96; i < 128; i++) plainByteArray[i] = (byte) (i - 64); for (int i = 128; i < 256; i++) plainByteArray[i] = (byte) (i & 0X20); } - + private long scoreKey(final int elementNr, final int elementCount) { return 
(((elementCount & 0xFFFFFFFFL)) << 32) | ((elementNr & 0xFFFFFFFFL)); } - + public synchronized long totalCount() { - return gcount; + return this.gcount; } - + public synchronized int size() { - return map.size(); + return this.map.size(); } - - public boolean sizeSmaller(int size) { - return map.size() < size; + + public boolean sizeSmaller(final int size) { + return this.map.size() < size; } - + public synchronized boolean isEmpty() { - return map.isEmpty(); + return this.map.isEmpty(); } - + public synchronized void inc(final E obj) { inc(obj, 1); } - + public synchronized void dec(final E obj) { inc(obj, -1); } - + public void set(final E obj, final int newScore) { if (obj == null) return; synchronized (this) { - Long usk = map.remove(obj); // get unique score key, old entry is not needed any more + Long usk = this.map.remove(obj); // get unique score key, old entry is not needed any more if (newScore < 0) throw new OutOfLimitsException(newScore); - + if (usk == null) { // set new value - usk = Long.valueOf(scoreKey(encnt++, newScore)); - + usk = Long.valueOf(scoreKey(this.encnt++, newScore)); + // put new value into cluster - map.put(obj, usk); - pam.put(usk, obj); - + this.map.put(obj, usk); + this.pam.put(usk, obj); + } else { // delete old entry - pam.remove(usk); - + this.pam.remove(usk); + // get previous handle and score final long c = usk.longValue(); final int oldScore = (int) ((c & 0xFFFFFFFF00000000L) >> 32); final int oldHandle = (int) (c & 0xFFFFFFFFL); - gcount -= oldScore; - + this.gcount -= oldScore; + // set new value usk = Long.valueOf(scoreKey(oldHandle, newScore)); // generates an unique key for a specific score - map.put(obj, usk); - pam.put(usk, obj); + this.map.put(obj, usk); + this.pam.put(usk, obj); } - } + } // increase overall counter - gcount += newScore; + this.gcount += newScore; } - + public void inc(final E obj, final int incrementScore) { if (obj == null) return; synchronized (this) { - Long usk = map.remove(obj); // get unique 
score key, old entry is not needed any more - + Long usk = this.map.remove(obj); // get unique score key, old entry is not needed any more + if (usk == null) { // set new value if (incrementScore < 0) throw new OutOfLimitsException(incrementScore); - usk = Long.valueOf(scoreKey(encnt++, incrementScore)); - + usk = Long.valueOf(scoreKey(this.encnt++, incrementScore)); + // put new value into cluster - map.put(obj, usk); - pam.put(usk, obj); - + this.map.put(obj, usk); + this.pam.put(usk, obj); + } else { // delete old entry - pam.remove(usk); - + this.pam.remove(usk); + // get previous handle and score final long c = usk.longValue(); final int oldScore = (int) ((c & 0xFFFFFFFF00000000L) >> 32); final int oldHandle = (int) (c & 0xFFFFFFFFL); - + // set new value final int newValue = oldScore + incrementScore; if (newValue < 0) throw new OutOfLimitsException(newValue); usk = Long.valueOf(scoreKey(oldHandle, newValue)); // generates an unique key for a specific score - map.put(obj, usk); - pam.put(usk, obj); + this.map.put(obj, usk); + this.pam.put(usk, obj); } - } + } // increase overall counter - gcount += incrementScore; + this.gcount += incrementScore; } public void dec(final E obj, final int incrementScore) { inc(obj, -incrementScore); } - + public int delete(final E obj) { // deletes entry and returns previous score if (obj == null) return 0; final Long usk; synchronized (this) { - usk = map.remove(obj); // get unique score key, old entry is not needed any more + usk = this.map.remove(obj); // get unique score key, old entry is not needed any more if (usk == null) return 0; - + // delete old entry - pam.remove(usk); + this.pam.remove(usk); } - + // get previous handle and score final int oldScore = (int) ((usk.longValue() & 0xFFFFFFFF00000000L) >> 32); // decrease overall counter - gcount -= oldScore; - - return oldScore; + this.gcount -= oldScore; + + return oldScore; } public synchronized boolean containsKey(final E obj) { - return map.containsKey(obj); + 
return this.map.containsKey(obj); } - + public int get(final E obj) { if (obj == null) return 0; final Long cs; synchronized (this) { - cs = map.get(obj); + cs = this.map.get(obj); } if (cs == null) return 0; return (int) ((cs.longValue() & 0xFFFFFFFF00000000L) >> 32); } - + public synchronized int getMaxScore() { - if (map.isEmpty()) return -1; - return (int) ((pam.lastKey().longValue() & 0xFFFFFFFF00000000L) >> 32); + if (this.map.isEmpty()) return -1; + return (int) ((this.pam.lastKey().longValue() & 0xFFFFFFFF00000000L) >> 32); } public synchronized int getMinScore() { - if (map.isEmpty()) return -1; - return (int) ((pam.firstKey().longValue() & 0xFFFFFFFF00000000L) >> 32); + if (this.map.isEmpty()) return -1; + return (int) ((this.pam.firstKey().longValue() & 0xFFFFFFFF00000000L) >> 32); } public synchronized E getMaxKey() { - if (map.isEmpty()) return null; - return pam.get(pam.lastKey()); + if (this.map.isEmpty()) return null; + return this.pam.get(this.pam.lastKey()); } - + public synchronized E getMinKey() { - if (map.isEmpty()) return null; - return pam.get(pam.firstKey()); + if (this.map.isEmpty()) return null; + return this.pam.get(this.pam.firstKey()); } - + public String toString() { - return map + " / " + pam; + return this.map + " / " + this.pam; } - + public synchronized Iterator keys(final boolean up) { if (up) return new simpleScoreIterator(); return new reverseScoreIterator(); } - + private class reverseScoreIterator implements Iterator { SortedMap view; Long key; - + public reverseScoreIterator() { - view = pam; + this.view = ClusteredScoreMap.this.pam; } - + public boolean hasNext() { - return !view.isEmpty(); + return !this.view.isEmpty(); } - + public E next() { - key = view.lastKey(); - view = view.headMap(key); - final E value = pam.get(key); + this.key = this.view.lastKey(); + this.view = this.view.headMap(this.key); + final E value = ClusteredScoreMap.this.pam.get(this.key); //System.out.println("cluster reverse iterator: score = " + 
((((Long) key).longValue() & 0xFFFFFFFF00000000L) >> 32) + ", handle = " + (((Long) key).longValue() & 0xFFFFFFFFL) + ", value = " + value); return value; } - + public void remove() { - final Object val = pam.remove(key); - if (val != null) map.remove(val); + final Object val = ClusteredScoreMap.this.pam.remove(this.key); + if (val != null) ClusteredScoreMap.this.map.remove(val); } - + } - + private class simpleScoreIterator implements Iterator { Iterator> ii; Map.Entry entry; - + public simpleScoreIterator() { - ii = pam.entrySet().iterator(); + this.ii = ClusteredScoreMap.this.pam.entrySet().iterator(); } - + public boolean hasNext() { - return ii.hasNext(); + return this.ii.hasNext(); } - + public E next() { - entry = ii.next(); + this.entry = this.ii.next(); //System.out.println("cluster simple iterator: score = " + ((((Long) entry.getKey()).longValue() & 0xFFFFFFFF00000000L) >> 32) + ", handle = " + (((Long) entry.getKey()).longValue() & 0xFFFFFFFFL) + ", value = " + entry.getValue()); - return entry.getValue(); + return this.entry.getValue(); } - + public void remove() { - ii.remove(); - if (entry.getValue() != null) map.remove(entry.getValue()); + this.ii.remove(); + if (this.entry.getValue() != null) ClusteredScoreMap.this.map.remove(this.entry.getValue()); } - + } - + public static void main(final String[] args) { - + final String t = "ZZZZZZZZZZ"; System.out.println("score of " + t + ": " + object2score(t)); if (args.length > 0) { System.out.println("score of " + args[0] + ": " + object2score(args[0])); System.exit(0); } - + System.out.println("Test for Score: start"); final ClusteredScoreMap s = new ClusteredScoreMap(); long c = 0; @@ -409,14 +417,14 @@ public final class ClusteredScoreMap extends AbstractScoreMap implements R final Random random = new Random(1234); int r; final int count = 20; - + for (int x = 0; x < 100000; x++) { for (int i = 0; i < count; i++) { r = Math.abs(random.nextInt(100)); s.inc("score#" + r, r); c += r; } - + // delete some 
int p; for (int i = 0; i < (count / 2); i++) { @@ -429,13 +437,13 @@ public final class ClusteredScoreMap extends AbstractScoreMap implements R } System.out.println("finished create. time = " + (System.currentTimeMillis() - time)); - + System.out.println("result:"); Iterator i = s.keys(true); while (i.hasNext()) System.out.println("up: " + i.next()); i = s.keys(false); while (i.hasNext()) System.out.println("down: " + i.next()); - + System.out.println("total=" + s.totalCount() + ", elements=" + s.size() + ", redundant count=" + c); } } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index bb33ecaf3..424ba0845 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -7,12 +7,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -37,7 +37,6 @@ import java.util.Set; import java.util.SortedSet; import java.util.TreeMap; - import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; @@ -51,7 +50,7 @@ import net.yacy.kelondro.util.SetTools; public final class Condenser { - + // this is the page analysis class public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form public final static int wordminsize = 2; @@ -82,21 +81,21 @@ public final class Condenser { public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file - + private final static int numlength = 5; - + //private Properties analysis; - private Map words; // a string (the words) to (indexWord) - relation - + private final Map words; // a string (the words) to (indexWord) - relation + //public int RESULT_NUMB_TEXT_BYTES = -1; public int RESULT_NUMB_WORDS = -1; public int RESULT_DIFF_WORDS = -1; public int RESULT_NUMB_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1; public Bitfield RESULT_FLAGS = new Bitfield(4); - private Identificator languageIdentificator; + private final Identificator languageIdentificator; private final NumberFormat intStringFormatter = NumberFormat.getIntegerInstance(); // use a new instance for each object for a better concurrency - + public Condenser( final Document document, final boolean indexText, @@ -112,15 +111,15 @@ public final class Condenser { this.RESULT_FLAGS = new Bitfield(4); // construct flag set for document - if (!document.getImages().isEmpty()) RESULT_FLAGS.set(flag_cat_hasimage, true); - if (!document.getAudiolinks().isEmpty()) RESULT_FLAGS.set(flag_cat_hasaudio, true); - if (!document.getVideolinks().isEmpty()) RESULT_FLAGS.set(flag_cat_hasvideo, 
true); - if (!document.getApplinks().isEmpty()) RESULT_FLAGS.set(flag_cat_hasapp, true); - if (document.lat() != 0.0f && document.lon() != 0.0f) RESULT_FLAGS.set(flag_cat_haslocation, true); - + if (!document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); + if (!document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (!document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (!document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); + if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true); + this.languageIdentificator = new Identificator(); - - + + Map.Entry entry; if (indexText) { assert document.getText() != null : document.dc_identifier(); @@ -137,18 +136,18 @@ public final class Condenser { // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!) // phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text - - insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib); + + insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, 
this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); // missing: tags! final String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { - insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib); + insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib); } - + // anchors: for text indexing we add only the anchor description // REMOVED! Reason: // words from the anchor description should appear as normal text in the output from the parser @@ -169,9 +168,9 @@ public final class Condenser { this.RESULT_NUMB_SENTENCES = 0; this.RESULT_DIFF_SENTENCES = 0; } - + // add the URL components to the word list - insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib); + insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); if (indexMedia) { // add anchor descriptions: here, we also add the url components @@ -179,24 +178,24 @@ public final class Condenser { Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib); - insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib); } // video i = 
document.getVideolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib); - insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib); } // applications i = document.getApplinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib); - insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib); } // images @@ -207,25 +206,25 @@ public final class Condenser { ientry = j.next(); url = ientry.url(); if (url == null) continue; - insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib); - insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib); + insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib); } - + // finally check all words for missing flag entry - final Iterator> k = words.entrySet().iterator(); + final Iterator> k = this.words.entrySet().iterator(); Word wprop; Map.Entry we; while (k.hasNext()) { we = k.next(); wprop = we.getValue(); if (wprop.flags == null) { - wprop.flags = RESULT_FLAGS.clone(); - words.put(we.getKey(), wprop); + wprop.flags = this.RESULT_FLAGS.clone(); + 
this.words.put(we.getKey(), wprop); } } } } - + private void insertTextToWords( final String text, final int phrase, @@ -241,13 +240,13 @@ public final class Condenser { int pip = 0; while (wordenum.hasMoreElements()) { word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); - if (useForLanguageIdentification) languageIdentificator.add(word); + if (useForLanguageIdentification) this.languageIdentificator.add(word); if (word.length() < 2) continue; - wprop = words.get(word); + wprop = this.words.get(word); if (wprop == null) wprop = new Word(0, pip, phrase); if (wprop.flags == null) wprop.flags = flagstemplate.clone(); wprop.flags.set(flagpos, true); - words.put(word, wprop); + this.words.put(word, wprop); pip++; this.RESULT_NUMB_WORDS++; this.RESULT_DIFF_WORDS++; @@ -257,23 +256,23 @@ public final class Condenser { public Condenser(final InputStream text, final WordCache meaningLib) { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); - words = new TreeMap(); + this.words = new TreeMap(); createCondensement(text, meaningLib); } - + public int excludeWords(final SortedSet stopwords) { // subtracts the given stopwords from the word list // the word list shrinkes. 
This returns the number of shrinked words - final int oldsize = words.size(); - SetTools.excludeDestructive(words, stopwords); - return oldsize - words.size(); + final int oldsize = this.words.size(); + SetTools.excludeDestructive(this.words, stopwords); + return oldsize - this.words.size(); } public Map words() { // returns the words as word/indexWord relation map - return words; + return this.words; } - + public String language() { return this.languageIdentificator.getLanguage(); } @@ -284,23 +283,24 @@ public final class Condenser { String word = ""; String k; int wordlen; - Word wsp, wsp1; + Word wsp; + final Word wsp1; int wordHandle; int wordHandleCount = 0; - int sentenceHandleCount = 0; + final int sentenceHandleCount = 0; int allwordcounter = 0; - int allsentencecounter = 0; + final int allsentencecounter = 0; int wordInSentenceCounter = 1; boolean comb_indexof = false, last_last = false, last_index = false; final Map sentences = new HashMap(100); - + // read source final WordTokenizer wordenum = new WordTokenizer(is, meaningLib); while (wordenum.hasMoreElements()) { - word = wordenum.nextElement().toLowerCase(Locale.ENGLISH); - if (languageIdentificator != null) languageIdentificator.add(word); + word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); + if (this.languageIdentificator != null) this.languageIdentificator.add(word); if (word.length() < wordminsize) continue; - + // distinguish punctuation and words wordlen = word.length(); if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { @@ -316,11 +316,11 @@ public final class Condenser { if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; last_last = word.equals("last"); last_index = word.equals("index"); - + // store word allwordcounter++; currsentwords.add(word); - wsp = words.get(word); + wsp = this.words.get(word); if (wsp != null) { // word already exists wordHandle = wsp.posInText; @@ -329,8 +329,8 @@ public final class Condenser { // word 
does not yet exist, create new word entry wordHandle = wordHandleCount++; wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100); - wsp.flags = RESULT_FLAGS.clone(); - words.put(word, wsp); + wsp.flags = this.RESULT_FLAGS.clone(); + this.words.put(word, wsp); } // we now have the unique handle of the word, put it into the sentence: wordInSentenceCounter++; @@ -341,7 +341,7 @@ public final class Condenser { Map.Entry entry; // we search for similar words and reorganize the corresponding sentences // a word is similar, if a shortened version is equal - final Iterator> wi = words.entrySet().iterator(); // enumerates the keys in descending order + final Iterator> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order wordsearch: while (wi.hasNext()) { entry = wi.next(); word = entry.getKey(); @@ -350,10 +350,10 @@ public final class Condenser { for (int i = wordcut; i > 0; i--) { if (wordlen > i) { k = word.substring(0, wordlen - i); - if (words.containsKey(k)) { + if (this.words.containsKey(k)) { // update word counter wsp1.count = wsp1.count + wsp.count; - words.put(k, wsp1); + this.words.put(k, wsp1); // remove current word wi.remove(); continue wordsearch; @@ -370,7 +370,7 @@ public final class Condenser { this.RESULT_NUMB_SENTENCES = allsentencecounter; this.RESULT_DIFF_SENTENCES = sentenceHandleCount; } - + public static Map getWords(final String text, final WordCache meaningLib) { // returns a word/indexWord relation map if (text == null) return null; @@ -378,7 +378,7 @@ public final class Condenser { buffer = new ByteArrayInputStream(UTF8.getBytes(text)); return new Condenser(buffer, meaningLib).words(); } - + public static void main(final String[] args) { // read a property file and convert them into configuration lines try { @@ -391,8 +391,8 @@ public final class Condenser { sb.append('"'); final String s = p.getProperty("keywords" + i); final String[] l = s.split(","); - for (int j = 0; j < l.length; j++) { - 
sb.append(ASCII.String(Word.word2hash(l[j]))); + for (final String element : l) { + sb.append(ASCII.String(Word.word2hash(element))); } if (i < 15) sb.append(",\n"); } @@ -403,7 +403,7 @@ public final class Condenser { } catch (final IOException e) { Log.logException(e); } - + } } diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index e89c00e11..048d7cd28 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -255,8 +255,8 @@ public class LibraryProvider { final File here = new File("dummy").getParentFile(); initialize(new File(here, "DATA/DICTIONARIES")); System.out.println("dymDict-size = " + dymLib.size()); - final Set r = dymLib.recommend("da"); - for (final String s: r) { + final Set r = dymLib.recommend(new StringBuilder("da")); + for (final StringBuilder s: r) { System.out.println("$ " + s); } System.out.println("recommendations: " + r.size()); diff --git a/source/net/yacy/document/StringBuilderComparator.java b/source/net/yacy/document/StringBuilderComparator.java new file mode 100644 index 000000000..41e77dc0b --- /dev/null +++ b/source/net/yacy/document/StringBuilderComparator.java @@ -0,0 +1,147 @@ +/** + * StringBuilderComparator.java + * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 09.11.2011 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ + +package net.yacy.document; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.regex.Pattern; + +/** + * case-sensitive or case-insensitive compare of two StringBuilder objects + * this shall replace the corresponding method in class String when StringBuilder objects are not transformed into string + */ +public class StringBuilderComparator implements Comparator { + + public static final StringBuilderComparator CASE_SENSITIVE_ORDER = new StringBuilderComparator(false); + public static final StringBuilderComparator CASE_INSENSITIVE_ORDER = new StringBuilderComparator(true); + + private final boolean caseInsensitive; + + public StringBuilderComparator(final boolean caseInsensitive) { + this.caseInsensitive = caseInsensitive; + } + + public int compare(final StringBuilder sb0, final StringBuilder sb1) { + final int l0 = sb0.length(); + final int l1 = sb1.length(); + final int ml = Math.min(l0, l1); + char c0, c1; + for (int i = 0; i < ml; i++) { + c0 = sb0.charAt(i); + c1 = sb1.charAt(i); + if (c0 == c1) continue; + if (this.caseInsensitive) { + c0 = Character.toUpperCase(c0); + c1 = Character.toUpperCase(c1); + if (c0 == c1) continue; + c0 = Character.toLowerCase(c0); + c1 = Character.toLowerCase(c1); + if (c0 == c1) continue; + } + return c0 - c1; + } + return l0 - l1; + } + + public boolean equals(final StringBuilder sb0, final StringBuilder sb1) { + final int l0 = sb0.length(); + final int l1 = sb1.length(); + if (l0 != l1) return false; + return equals(sb0, sb1, l1); + } + + public boolean startsWith(final StringBuilder sb0, final StringBuilder sb1) { + final int l0 = sb0.length(); + final int l1 = sb1.length(); + if (l0 < l1) return false; + return equals(sb0, sb1, l1); + } + + private boolean equals(final StringBuilder sb0, final StringBuilder sb1, final int l) { + char c0, c1; + for (int i = 0; i
< l; i++) { + c0 = sb0.charAt(i); + c1 = sb1.charAt(i); + if (c0 == c1) continue; + if (this.caseInsensitive) { + c0 = Character.toUpperCase(c0); + c1 = Character.toUpperCase(c1); + if (c0 == c1) continue; + c0 = Character.toLowerCase(c0); + c1 = Character.toLowerCase(c1); + if (c0 == c1) continue; + } + return false; + } + return true; + } + + // methods that can be useful for StringBuilder as replacement of String + + public int indexOf(final StringBuilder sb, final char ch) { + final int max = sb.length(); + for (int i = 0; i < max ; i++) { + if (sb.charAt(i) == ch) return i; + } + return -1; + } + + public int indexOf(final StringBuilder sb, final int off, final char ch) { + final int max = sb.length(); + for (int i = off; i < max ; i++) { + if (sb.charAt(i) == ch) return i; + } + return -1; + } + + public StringBuilder[] split(final StringBuilder sb, final char c) { + int next = 0; + int off = 0; + final ArrayList list = new ArrayList(); + while ((next = indexOf(sb, off, c)) != -1) { + list.add(sb.substring(off, next)); + off = next + 1; + } + if (off == 0) return new StringBuilder[] { sb }; + + list.add(sb.substring(off, sb.length())); + + int resultSize = list.size(); + while (resultSize > 0 && list.get(resultSize - 1).length() == 0) resultSize--; + final StringBuilder[] result = new StringBuilder[resultSize]; + for (int i = 0; i < resultSize; i++) result[i] = new StringBuilder(list.get(i)); + return result; + } + + public static StringBuilder[] split(final StringBuilder sb, final Pattern pattern) { + final String[] p = pattern.split(sb); + final StringBuilder[] h = new StringBuilder[p.length]; + for (int i = 0; i < p.length; i++) h[i] = new StringBuilder(p[i]); + return h; + } + + public static void main(final String[] args) { + final StringBuilder s = new StringBuilder("ene mene mu"); + final StringBuilder[] t = StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(s, ' '); + for (final StringBuilder u: t) System.out.println(u.toString()); + } + +} diff 
--git a/source/net/yacy/document/WordCache.java b/source/net/yacy/document/WordCache.java index e45a1c1da..ebbeac1c6 100644 --- a/source/net/yacy/document/WordCache.java +++ b/source/net/yacy/document/WordCache.java @@ -49,12 +49,12 @@ public class WordCache { // common word cache private static final int commonWordsMaxSize = 100000; // maximum size of common word cache private static final int commonWordsMinLength = 5; // words must have that length at minimum - private static OrderedScoreMap commonWords = new OrderedScoreMap(String.CASE_INSENSITIVE_ORDER); + private static OrderedScoreMap commonWords = new OrderedScoreMap(StringBuilderComparator.CASE_INSENSITIVE_ORDER); // dictionaries private final File dictionaryPath; - private TreeSet dict; // the word dictionary - private TreeSet tcid; // the dictionary of reverse words + private TreeSet dict; // the word dictionary + private TreeSet tcid; // the dictionary of reverse words /** * create a new dictionary @@ -72,7 +72,7 @@ public class WordCache { * add a word to the generic dictionary * @param word */ - public static void learn(final String word) { + public static void learn(final StringBuilder word) { if (word == null) return; if (word.length() < commonWordsMinLength) return; if (MemoryControl.shortStatus()) commonWords.clear(); @@ -86,8 +86,8 @@ public class WordCache { * scan the input directory and load all dictionaries (again) */ public void reload() { - this.dict = new TreeSet(); - this.tcid = new TreeSet(); + this.dict = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); + this.tcid = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return; final String[] files = this.dictionaryPath.list(); for (final String f: files) { @@ -106,25 +106,27 @@ public class WordCache { } final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); String l; + StringBuilder sb; try { while ((l = 
reader.readLine()) != null) { if (l.length() == 0 || l.charAt(0) == '#') continue; l = l.trim().toLowerCase(); if (l.length() < 4) continue; - this.dict.add(l); - this.tcid.add(reverse(l)); + sb = new StringBuilder(l); + this.dict.add(sb); + this.tcid.add(reverse(sb)); } } catch (final IOException e) { // finish } } - private static String reverse(final String s) { + private static StringBuilder reverse(final StringBuilder s) { final StringBuilder sb = new StringBuilder(s.length()); for (int i = s.length() - 1; i >= 0; i--) { sb.append(s.charAt(i)); } - return sb.toString(); + return sb; } /** @@ -132,25 +134,24 @@ public class WordCache { * @param s input value that is used to match recommendations * @return set that contains all words that start or end with the input value */ - public Set recommend(final String s) { - final Set ret = new HashSet(); - String string = s.trim().toLowerCase(); - SortedSet t = this.dict.tailSet(string); - for (final String r: t) { - if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break; + public Set recommend(StringBuilder string) { + final Set ret = new HashSet(); + SortedSet t = this.dict.tailSet(string); + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(r); else break; } - final SortedMap u = commonWords.tailMap(string); - String vv; + final SortedMap u = commonWords.tailMap(string); + StringBuilder vv; try { - for (final Map.Entry v: u.entrySet()) { + for (final Map.Entry v: u.entrySet()) { vv = v.getKey(); - if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break; + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(vv, string) && vv.length() > string.length()) ret.add(vv); else break; } } catch (final ConcurrentModificationException e) {} string = reverse(string); t = this.tcid.tailSet(string); - for (final String r: t) { - if (r.startsWith(string) && r.length() > 
string.length()) ret.add(reverse(r)); else break; + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, string) && r.length() > string.length()) ret.add(reverse(r)); else break; } return ret; } @@ -160,8 +161,8 @@ public class WordCache { * @param s the given word * @return true if the library contains the word */ - public boolean contains(final String s) { - return this.dict.contains(s.trim().toLowerCase()); + public boolean contains(final StringBuilder s) { + return this.dict.contains(s); // if the above case is true then it is also true for this.tcid and vice versa // that means it does not need to be tested as well } @@ -173,16 +174,15 @@ public class WordCache { * @param s the given word * @return true if the library supports the word */ - public boolean supports(final String s) { - String string = s.trim().toLowerCase(); - SortedSet t = this.dict.tailSet(string); - for (final String r: t) { - if (string.startsWith(r)) return true; else break; + public boolean supports(StringBuilder string) { + SortedSet t = this.dict.tailSet(string); + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; } string = reverse(string); t = this.tcid.tailSet(string); - for (final String r: t) { - if (string.startsWith(r)) return true; else break; + for (final StringBuilder r: t) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(string, r)) return true; else break; } return false; } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index e1ac29216..0ca592ade 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -37,7 +37,7 @@ import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.order.Base64Order; -public class WordTokenizer implements Enumeration { +public class WordTokenizer implements Enumeration { // this enumeration 
removes all words that contain either wrong characters or are too short private StringBuilder buffer = null; @@ -72,8 +72,8 @@ public class WordTokenizer implements Enumeration { return this.buffer != null; } - public String nextElement() { - final String r = (this.buffer == null) ? null : this.buffer.toString(); + public StringBuilder nextElement() { + final StringBuilder r = (this.buffer == null) ? null : this.buffer; this.buffer = nextElement0(); // put word to words statistics cache if (this.meaningLib != null) WordCache.learn(r); @@ -172,14 +172,14 @@ public class WordTokenizer implements Enumeration { */ public static SortedMap hashSentence(final String sentence, final WordCache meaningLib) { final SortedMap map = new TreeMap(Base64Order.enhancedCoder); - final Enumeration words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); + final Enumeration words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); int pos = 0; - String word; + StringBuilder word; byte[] hash; Integer oldpos; while (words.hasMoreElements()) { word = words.nextElement(); - hash = Word.word2hash(word.toString()); + hash = Word.word2hash(word); // don't overwrite old values, that leads to too far word distances oldpos = map.put(hash, LargeNumberCache.valueOf(pos)); diff --git a/source/net/yacy/document/geolocalization/GeonamesLocalization.java b/source/net/yacy/document/geolocalization/GeonamesLocalization.java index 8cdc78610..7ca149c78 100644 --- a/source/net/yacy/document/geolocalization/GeonamesLocalization.java +++ b/source/net/yacy/document/geolocalization/GeonamesLocalization.java @@ -2,19 +2,19 @@ * GeonamesLocalization.java * Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany * first published 16.05.2010 on http://yacy.net - * + * * This file is part of YaCy Content Integration * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -27,12 +27,10 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.text.Collator; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.SortedMap; @@ -41,6 +39,7 @@ import java.util.TreeSet; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; +import net.yacy.document.StringBuilderComparator; import net.yacy.kelondro.logging.Log; public class GeonamesLocalization implements Localization { @@ -59,42 +58,35 @@ public class GeonamesLocalization implements Localization { country code : ISO-3166 2-letter country code, 2 characters cc2 : alternate country codes, comma separated, ISO-3166 2-letter country code, 60 characters admin1 code : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20) - admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) + admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; 
varchar(80) admin3 code : code for third level administrative division, varchar(20) admin4 code : code for fourth level administrative division, varchar(20) - population : bigint (8 byte int) + population : bigint (8 byte int) elevation : in meters, integer gtopo30 : average elevation of 30'x30' (ca 900mx900m) area in meters, integer timezone : the timezone id (see file timeZone.txt) modification date : date of last modification in yyyy-MM-dd format */ - - // use a collator to relax when distinguishing between lowercase und uppercase letters - private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); - static { - insensitiveCollator.setStrength(Collator.SECONDARY); - insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); - } - + private final Map id2loc; - private final TreeMap> name2ids; + private final TreeMap> name2ids; private final File file; - + public GeonamesLocalization(final File file) { // this is a processing of the cities1000.zip file from http://download.geonames.org/export/dump/ this.file = file; this.id2loc = new HashMap(); - this.name2ids = new TreeMap>(insensitiveCollator); - + this.name2ids = new TreeMap>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); + if (file == null || !file.exists()) return; BufferedReader reader; try { - ZipFile zf = new ZipFile(file); - ZipEntry ze = zf.getEntry("cities1000.txt"); - InputStream is = zf.getInputStream(ze); + final ZipFile zf = new ZipFile(file); + final ZipEntry ze = zf.getEntry("cities1000.txt"); + final InputStream is = zf.getInputStream(ze); reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); return; } @@ -103,61 +95,71 @@ public class GeonamesLocalization implements Localization { try { String line; String[] fields; - Set locnames; + Set locnames; while ((line = reader.readLine()) != null) { if (line.length() == 0) continue; fields = line.split("\t"); - int id = 
Integer.parseInt(fields[0]); - locnames = new HashSet(); - locnames.add(fields[1]); - locnames.add(fields[2]); - for (String s: fields[3].split(",")) locnames.add(s); - Location c = new Location(Float.parseFloat(fields[5]), Float.parseFloat(fields[4]), fields[1]); + final int id = Integer.parseInt(fields[0]); + locnames = new HashSet(); + locnames.add(new StringBuilder(fields[1])); + locnames.add(new StringBuilder(fields[2])); + for (final String s: fields[3].split(",")) locnames.add(new StringBuilder(s)); + final Location c = new Location(Float.parseFloat(fields[5]), Float.parseFloat(fields[4]), fields[1]); c.setPopulation((int) Long.parseLong(fields[14])); this.id2loc.put(id, c); - for (String name: locnames) { + for (final StringBuilder name: locnames) { List locs = this.name2ids.get(name); if (locs == null) locs = new ArrayList(1); locs.add(id); this.name2ids.put(name, locs); } } - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } } public int locations() { - return id2loc.size(); + return this.id2loc.size(); } - - public TreeSet find(String anyname, boolean locationexact) { - Set r = new HashSet(); + + public TreeSet find(final String anyname, final boolean locationexact) { + final Set r = new HashSet(); List c; + final StringBuilder an = new StringBuilder(anyname); if (locationexact) { c = this.name2ids.get(anyname); if (c != null) r.addAll(c); } else { - SortedMap> cities = this.name2ids.tailMap(anyname); - for (Map.Entry> e: cities.entrySet()) { - if (e.getKey().toLowerCase().startsWith(anyname.toLowerCase())) r.addAll(e.getValue()); else break; + final SortedMap> cities = this.name2ids.tailMap(an); + for (final Map.Entry> e: cities.entrySet()) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(e.getKey(), an)) r.addAll(e.getValue()); else break; } } - TreeSet a = new TreeSet(); - for (Integer e: r) { - Location w = this.id2loc.get(e); + final TreeSet a = new TreeSet(); + for (final Integer e: r) { + final 
Location w = this.id2loc.get(e); if (w != null) a.add(w); } return a; } - public Set recommend(String s) { - Set a = new HashSet(); - s = s.trim().toLowerCase(); + public Set recommend(final String s) { + final Set a = new HashSet(); + final StringBuilder an = new StringBuilder(s); if (s.length() == 0) return a; - SortedMap> t = this.name2ids.tailMap(s); - for (String r: t.keySet()) { - r = r.toLowerCase(); - if (r.startsWith(s)) a.add(r); else break; + final SortedMap> t = this.name2ids.tailMap(an); + for (final StringBuilder r: t.keySet()) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, an)) a.add(r.toString()); else break; + } + return a; + } + + public Set recommend(final StringBuilder s) { + final Set a = new HashSet(); + if (s.length() == 0) return a; + final SortedMap> t = this.name2ids.tailMap(s); + for (final StringBuilder r: t.keySet()) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, s)) a.add(r); else break; } return a; } @@ -165,13 +167,13 @@ public class GeonamesLocalization implements Localization { public String nickname() { return this.file.getName(); } - + public int hashCode() { - return this.nickname().hashCode(); + return nickname().hashCode(); } - - public boolean equals(Object other) { + + public boolean equals(final Object other) { if (!(other instanceof Localization)) return false; - return this.nickname().equals(((Localization) other).nickname()); + return nickname().equals(((Localization) other).nickname()); } } diff --git a/source/net/yacy/document/geolocalization/Localization.java b/source/net/yacy/document/geolocalization/Localization.java index ff299e127..130cee536 100644 --- a/source/net/yacy/document/geolocalization/Localization.java +++ b/source/net/yacy/document/geolocalization/Localization.java @@ -2,19 +2,19 @@ * Localization.java * Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany * first published 16.05.2010 on http://yacy.net - * + * * This file is part of YaCy Content Integration * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -32,13 +32,13 @@ import java.util.TreeSet; * */ public interface Localization { - + /** * the number of locations that this localization stores * @return the number of locations */ public int locations(); - + /** * find a location by name * @param anyname - a name of a location @@ -53,19 +53,25 @@ public interface Localization { * @return a set of names that match with the given name using the local dictionary of names */ public Set recommend(String s); - + /** + * recommend a set of names according to a given name + * @param s a possibly partially matching name + * @return a set of names that match with the given name using the local dictionary of names + */ + public Set recommend(StringBuilder s); + /** * return an nickname of the localization service * @return the nickname */ public String nickname(); - + /** * hashCode that must be used to distinguish localization services in hash sets * @return the hash code, may be derived from the nickname */ public int hashCode(); - + /** * compare localization services; to be used for hash sets with localization services * @param other diff --git a/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java 
b/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java index e2ecb3899..bbd0f336b 100644 --- a/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java +++ b/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java @@ -2,19 +2,19 @@ * OpenGeoDBLocalization * Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany * first published 04.10.2009 on http://yacy.net - * + * * This file is part of YaCy Content Integration * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -28,12 +28,10 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.text.Collator; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.SortedMap; @@ -41,6 +39,7 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.zip.GZIPInputStream; +import net.yacy.document.StringBuilderComparator; import net.yacy.kelondro.logging.Log; @@ -49,42 +48,35 @@ import net.yacy.kelondro.logging.Log; * files can be loaded from http://sourceforge.net/projects/opengeodb/files/ * this class is used by the LibraryProvider, which expects input files inside * DATA\DICTIONARIES\source - * + * * ATTENTION: * if this class is used, expect an extra memory usage of more than 100MB! - * + * * This class will provide a super-fast access to the OpenGeoDB, * since all request are evaluated using data in the RAM. 
*/ public class OpenGeoDBLocalization implements Localization { - - // use a collator to relax when distinguishing between lowercase und uppercase letters - private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); - static { - insensitiveCollator.setStrength(Collator.SECONDARY); - insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); - } - + private final Map locTypeHash2locType; private final Map id2loc; private final Map id2locTypeHash; - private final TreeMap> name2ids; - private final Map> kfz2ids; + private final TreeMap> name2ids; + private final TreeMap> kfz2ids; private final Map> predial2ids; private final Map zip2id; private final File file; - - public OpenGeoDBLocalization(final File file, boolean lonlat) { + + public OpenGeoDBLocalization(final File file, final boolean lonlat) { this.file = file; this.locTypeHash2locType = new HashMap(); this.id2loc = new HashMap(); this.id2locTypeHash = new HashMap(); - this.name2ids = new TreeMap>(insensitiveCollator); - this.kfz2ids = new TreeMap>(insensitiveCollator); + this.name2ids = new TreeMap>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); + this.kfz2ids = new TreeMap>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); this.predial2ids = new HashMap>(); this.zip2id = new HashMap(); - + if (file == null || !file.exists()) return; BufferedReader reader = null; try { @@ -92,7 +84,7 @@ public class OpenGeoDBLocalization implements Localization { if (file.getName().endsWith(".gz")) is = new GZIPInputStream(is); reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); String line; - + // read lines String[] v; Integer id; @@ -114,7 +106,7 @@ public class OpenGeoDBLocalization implements Localization { lat = Float.parseFloat(v[2]); lon = Float.parseFloat(v[3]); } - id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat)); + this.id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat)); } if (line.startsWith("geodb_textdata ")) { line = line.substring(15 + 7); @@ 
-125,8 +117,8 @@ public class OpenGeoDBLocalization implements Localization { List l = this.name2ids.get(h); if (l == null) l = new ArrayList(1); l.add(id); - this.name2ids.put(h, l); - Location loc = this.id2loc.get(id); + this.name2ids.put(new StringBuilder(h), l); + final Location loc = this.id2loc.get(id); if (loc != null) loc.setName(h); } else if (v[1].equals("500400000")) { // Vorwahl id = Integer.parseInt(v[0]); @@ -138,8 +130,8 @@ public class OpenGeoDBLocalization implements Localization { } else if (v[1].equals("400300000")) { // Ortstyp id = Integer.parseInt(v[0]); h = removeQuotes(v[2]); - Integer hc = h.hashCode(); - String t = this.locTypeHash2locType.get(hc); + final Integer hc = h.hashCode(); + final String t = this.locTypeHash2locType.get(hc); if (t == null) this.locTypeHash2locType.put(hc, h); this.id2locTypeHash.put(id, hc); } else if (v[1].equals("500300000")) { // PLZ @@ -150,7 +142,7 @@ public class OpenGeoDBLocalization implements Localization { List l = this.kfz2ids.get(h); if (l == null) l = new ArrayList(1); l.add(id); - this.kfz2ids.put(h, l); + this.kfz2ids.put(new StringBuilder(h), l); } } continue; @@ -162,7 +154,7 @@ public class OpenGeoDBLocalization implements Localization { if (reader != null) try { reader.close(); } catch (final Exception e) {} } } - + private static final String removeQuotes(String s) { if (s.length() > 0 && s.charAt(0) != '\'') return s; if (s.charAt(s.length() - 1) != '\'') return s; @@ -171,9 +163,9 @@ public class OpenGeoDBLocalization implements Localization { } public int locations() { - return id2loc.size(); + return this.id2loc.size(); } - + /** * check database tables against occurrences of this entity * the anyname - String may be one of: @@ -184,55 +176,65 @@ public class OpenGeoDBLocalization implements Localization { * @param anyname * @return */ - public TreeSet find(String anyname, boolean locationexact) { - HashSet r = new HashSet(); + public TreeSet find(final String anyname, final boolean 
locationexact) { + final HashSet r = new HashSet(); List c; + final StringBuilder an = new StringBuilder(anyname); if (locationexact) { - c = this.name2ids.get(anyname); if (c != null) r.addAll(c); + c = this.name2ids.get(an); if (c != null) r.addAll(c); } else { - SortedMap> cities = this.name2ids.tailMap(anyname); - for (Map.Entry> e: cities.entrySet()) { - if (e.getKey().toLowerCase().startsWith(anyname.toLowerCase())) r.addAll(e.getValue()); else break; + final SortedMap> cities = this.name2ids.tailMap(an); + for (final Map.Entry> e: cities.entrySet()) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(e.getKey(), an)) r.addAll(e.getValue()); else break; } - c = this.kfz2ids.get(anyname); if (c != null) r.addAll(c); + c = this.kfz2ids.get(an); if (c != null) r.addAll(c); c = this.predial2ids.get(anyname); if (c != null) r.addAll(c); - Integer i = this.zip2id.get(anyname); if (i != null) r.add(i); + final Integer i = this.zip2id.get(anyname); if (i != null) r.add(i); } - TreeSet a = new TreeSet(); - for (Integer e: r) { - Location w = this.id2loc.get(e); + final TreeSet a = new TreeSet(); + for (final Integer e: r) { + final Location w = this.id2loc.get(e); if (w != null) a.add(w); } return a; } - + /** - * read the dictionary and construct a set of recommendations to a given string + * read the dictionary and construct a set of recommendations to a given string * @param s input value that is used to match recommendations * @return a set that contains all words that start with the input value */ - public Set recommend(String s) { - Set a = new HashSet(); - s = s.trim().toLowerCase(); + public Set recommend(final String s) { + final Set a = new HashSet(); + final StringBuilder an = new StringBuilder(s); + if (s.length() == 0) return a; + final SortedMap> t = this.name2ids.tailMap(an); + for (final StringBuilder r: t.keySet()) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, an)) a.add(r.toString()); else break; + } + return a; + 
} + + public Set recommend(final StringBuilder s) { + final Set a = new HashSet(); if (s.length() == 0) return a; - SortedMap> t = this.name2ids.tailMap(s); - for (String r: t.keySet()) { - r = r.toLowerCase(); - if (r.startsWith(s)) a.add(r); else break; + final SortedMap> t = this.name2ids.tailMap(s); + for (final StringBuilder r: t.keySet()) { + if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(r, s)) a.add(r); else break; } return a; } - + public String nickname() { return this.file.getName(); } - + public int hashCode() { - return this.nickname().hashCode(); + return nickname().hashCode(); } - - public boolean equals(Object other) { + + public boolean equals(final Object other) { if (!(other instanceof Localization)) return false; - return this.nickname().equals(((Localization) other).nickname()); + return nickname().equals(((Localization) other).nickname()); } } diff --git a/source/net/yacy/document/geolocalization/OverarchingLocalization.java b/source/net/yacy/document/geolocalization/OverarchingLocalization.java index 446d1f973..fbf704fe9 100644 --- a/source/net/yacy/document/geolocalization/OverarchingLocalization.java +++ b/source/net/yacy/document/geolocalization/OverarchingLocalization.java @@ -2,19 +2,19 @@ * OverarchingLocalization.java * Copyright 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany * first published 16.05.2010 on http://yacy.net - * + * * This file is part of YaCy Content Integration * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
- * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -30,46 +30,46 @@ import java.util.TreeSet; public class OverarchingLocalization implements Localization { - private Map services; - + private final Map services; + /** * create a new overarching localization object */ public OverarchingLocalization() { this.services = new HashMap(); } - + /** * add a localization service * @param nickname the nickname of the service * @param service the service */ - public void addLocalization(String nickname, Localization service) { + public void addLocalization(final String nickname, final Localization service) { this.services.put(nickname, service); } - + /** * remove a localization service * @param nickname */ - public void removeLocalization(String nickname) { + public void removeLocalization(final String nickname) { this.services.remove(nickname); } public int locations() { int locations = 0; - for (Localization service: this.services.values()) { + for (final Localization service: this.services.values()) { locations += service.locations(); } return locations; } - + /** * find (a set of) locations */ - public TreeSet find(String anyname, boolean locationexact) { - TreeSet locations = new TreeSet(); - for (Localization service: this.services.values()) { + public TreeSet find(final String anyname, final boolean locationexact) { + final TreeSet locations = new TreeSet(); + for (final Localization service: this.services.values()) { locations.addAll(service.find(anyname, locationexact)); } return locations; @@ -78,10 +78,19 @@ public class OverarchingLocalization implements Localization { /** * recommend location names */ - public Set recommend(String s) { - Set recommendations = new HashSet(); + public Set recommend(final String s) { + final Set recommendations = new HashSet(); if (s.length() == 0) return recommendations; - for (Localization service: this.services.values()) { + for 
(final Localization service: this.services.values()) { + recommendations.addAll(service.recommend(s)); + } + return recommendations; + } + + public Set recommend(final StringBuilder s) { + final Set recommendations = new HashSet(); + if (s.length() == 0) return recommendations; + for (final Localization service: this.services.values()) { recommendations.addAll(service.recommend(s)); } return recommendations; @@ -90,14 +99,14 @@ public class OverarchingLocalization implements Localization { public String nickname() { return "oa"; } - + public int hashCode() { - return this.nickname().hashCode(); + return nickname().hashCode(); } - - public boolean equals(Object other) { + + public boolean equals(final Object other) { if (!(other instanceof Localization)) return false; - return this.nickname().equals(((Localization) other).nickname()); + return nickname().equals(((Localization) other).nickname()); } } diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index e55d17d33..b90f8df89 100644 --- a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -30,6 +30,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Locale; import java.util.Set; + import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ConcurrentARC; import net.yacy.document.LargeNumberCache; @@ -59,7 +60,7 @@ public class Word { static { try { hashCache = new ConcurrentARC(hashCacheSize, Math.max(32, 4 * Runtime.getRuntime().availableProcessors())); - } catch (OutOfMemoryError e) { + } catch (final OutOfMemoryError e) { hashCache = new ConcurrentARC(1000, Math.max(8, 2 * Runtime.getRuntime().availableProcessors())); } } 
@@ -69,7 +70,7 @@ public class Word { hashCache = new ConcurrentHashMap(); } */ - + // object carries statistics for words and sentences public int count; // number of occurrences public int posInText; // unique handle, is initialized with word position (excluding double occurring words) @@ -88,29 +89,28 @@ public class Word { } public void inc() { - count++; + this.count++; } - + public int occurrences() { - return count; + return this.count; } public void check(final int i) { - phrases.add(LargeNumberCache.valueOf(i)); + this.phrases.add(LargeNumberCache.valueOf(i)); } public Iterator phrases() { // returns an iterator to handles of all phrases where the word appears - return phrases.iterator(); + return this.phrases.iterator(); } - + @Override public String toString() { // this is here for debugging - return "{count=" + count + ", posInText=" + posInText + ", posInPhrase=" + posInPhrase + ", numOfPhrase=" + numOfPhrase + "}"; + return "{count=" + this.count + ", posInText=" + this.posInText + ", posInPhrase=" + this.posInPhrase + ", numOfPhrase=" + this.numOfPhrase + "}"; } - - + // static methods public static byte[] word2hash(final StringBuilder word) { return word2hash(word.toString()); @@ -118,7 +118,7 @@ public class Word { // create a word hash public static final byte[] word2hash(final String word) { - String wordlc = word.toLowerCase(Locale.ENGLISH); + final String wordlc = word.toLowerCase(Locale.ENGLISH); byte[] h = hashCache.get(wordlc); if (h != null) return h; // calculate the hash @@ -132,25 +132,25 @@ public class Word { } return h; } - + public static final HandleSet words2hashesHandles(final Set words) { final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size()); for (final String word: words) try { hashes.put(word2hash(word)); - } catch (RowSpaceExceededException e) { + } catch (final RowSpaceExceededException e) { Log.logException(e); return hashes; } return 
hashes; } - + public static final HandleSet words2hashesHandles(final String[] words) { final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.length); for (final String word: words) try { hashes.put(word2hash(word)); - } catch (RowSpaceExceededException e) { + } catch (final RowSpaceExceededException e) { Log.logException(e); return hashes; } diff --git a/source/net/yacy/search/query/AccessTracker.java b/source/net/yacy/search/query/AccessTracker.java index 1e822488f..ab15c857b 100644 --- a/source/net/yacy/search/query/AccessTracker.java +++ b/source/net/yacy/search/query/AccessTracker.java @@ -59,7 +59,10 @@ public class AccessTracker { private static void add(final LinkedList list, final QueryParams query) { // learn that this word can be a word completion for the DidYouMeanLibrary - if (query.resultcount > 10 && query.queryString != null && query.queryString.length() > 0) WordCache.learn(query.queryString); + if (query.resultcount > 10 && query.queryString != null && query.queryString.length() > 0) { + final StringBuilder sb = new StringBuilder(query.queryString); + WordCache.learn(sb); + } // add query to statistics list list.add(query);