// kelondroMSetTools.java
// -------------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 28.12.2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.kelondro.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.logging.Log;

/**
 * Static helpers for joining (intersecting), excluding and loading sorted maps and sets.
 *
 * <p>Joins exist in two flavours: "by test" iterates the smaller collection and looks every
 * element up in the larger one, "by enumeration" walks both collections in parallel in sorted
 * order. The public entry points pick one of them with a simple cost estimate.
 */
public final class SetTools {

    //public static Comparator fastStringComparator = fastStringComparator(true);

    // ------------------------------------------------------------------------------------------------
    // helper methods

    /**
     * Computes the number of bits needed to represent {@code x}, i.e. {@code 1 + floor(log2(x))}
     * for positive {@code x}. This is not the logarithm itself.
     *
     * @param x the value to measure
     * @return the bit length of {@code x}; {@code 0} if {@code x} is zero or negative
     */
    public static int log2a(int x) {
        int l = 0;
        while (x > 0) {
            x = x >>> 1;
            l++;
        }
        return l;
    }

    /**
     * Cost estimate shared by all join/anymatch entry points: decides whether looking up each
     * element of the smaller collection in the larger one is expected to be cheaper than
     * enumerating both collections in parallel.
     *
     * @param size1 size of the first collection
     * @param size2 size of the second collection
     * @return true if the "by test" strategy should be used, false for "by enumeration"
     */
    private static boolean testIsCheaper(final int size1, final int size2) {
        final int high = Math.max(size1, size2);
        final int low = Math.min(size1, size2);
        // long arithmetic: the products exceed the int range for very large collections
        final long stepsEnum = 10L * ((long) high + low - 1);
        final long stepsTest = 12L * log2a(high) * low;
        return stepsEnum > stepsTest;
    }

    /**
     * Returns a usable comparator for a sorted collection. {@code SortedMap.comparator()} and
     * {@code SortedSet.comparator()} return null when the collection uses the natural ordering
     * of its elements; in that case a comparator delegating to {@code Comparable.compareTo}
     * is returned.
     *
     * @param declared the comparator reported by the collection, may be null
     * @return {@code declared} if it is not null, otherwise a natural-order comparator
     */
    @SuppressWarnings("unchecked")
    private static <A> Comparator<? super A> naturalIfNull(final Comparator<? super A> declared) {
        if (declared != null) return declared;
        return new Comparator<A>() {
            @Override
            public int compare(final A o1, final A o2) {
                // safe: a sorted collection without comparator only accepts Comparable elements
                return ((Comparable<A>) o1).compareTo(o2);
            }
        };
    }

    /**
     * Computes the value stored in a join result for a key that is present in both maps.
     *
     * @param first value from the first (leading) map
     * @param second value from the other map
     * @param concatStrings whether String values shall be concatenated
     * @return {@code first + second} if {@code concatStrings} is set and {@code first} is a
     *         String, otherwise {@code first}
     */
    @SuppressWarnings("unchecked")
    private static <B> B mergeValues(final B first, final B second, final boolean concatStrings) {
        if (concatStrings && first instanceof String) {
            // unchecked but safe: B is String here because 'first' is one
            return (B) ((String) first + (String) second);
        }
        return first;
    }

    // ------------------------------------------------------------------------------------------------
    // join
    // We distinguish two principal solutions
    // - constructive join (generate new data structure)
    // - destructive join (remove non-valid elements from given data structure)
    // The algorithm to perform the join can be also of two kinds:
    // - join by pairwise enumeration
    // - join by iterative tests (where we distinguish left-right and right-left tests)

    /**
     * Joins all maps of a collection: the result contains the keys that occur in every map.
     * The maps are processed in order of ascending size so that the intermediate result stays
     * as small as possible.
     *
     * @param maps the maps to join
     * @param concatStrings if true, String values of matching keys are concatenated
     * @return the join result; a new empty {@code TreeMap} if {@code maps} is empty, if any
     *         contained map is null or empty, or if the join is empty. If {@code maps} holds
     *         exactly one non-empty map, that map itself is returned (not a copy).
     */
    public static <A, B> SortedMap<A, B> joinConstructive(final Collection<SortedMap<A, B>> maps, final boolean concatStrings) {
        // first order entities by their size
        final SortedMap<Long, SortedMap<A, B>> orderMap = new TreeMap<Long, SortedMap<A, B>>();
        int count = 0;
        for (final SortedMap<A, B> singleMap : maps) {
            // a single empty operand makes the whole conjunction empty
            if ((singleMap == null) || (singleMap.isEmpty())) return new TreeMap<A, B>();

            // order by size first and by position second; size goes into the upper and the
            // position into the lower 32 bits, so the key can neither overflow nor collide
            orderMap.put(Long.valueOf((((long) singleMap.size()) << 32) | count), singleMap);
            count++;
        }

        // check if there is any result
        if (orderMap.isEmpty()) return new TreeMap<A, B>();

        // we now must pairwise build up a conjunction of these maps,
        // starting with the smallest one
        SortedMap<A, B> joinResult = orderMap.remove(orderMap.firstKey());
        while (!orderMap.isEmpty() && !joinResult.isEmpty()) {
            // combine the running result with the next smallest map
            final SortedMap<A, B> next = orderMap.remove(orderMap.firstKey());
            joinResult = joinConstructiveByTest(joinResult, next, concatStrings); // TODO: better with enumeration?
        }

        if (joinResult.isEmpty()) return new TreeMap<A, B>();
        return joinResult;
    }

    /**
     * Joins two sorted maps: the result contains the keys present in both maps.
     *
     * @param map1 first map
     * @param map2 second map
     * @param concatStrings if true, String values of matching keys are concatenated
     * @return a new map with the common keys; null if either map is null or if the two maps
     *         do not share the identical comparator instance
     */
    public static <A, B> SortedMap<A, B> joinConstructive(final SortedMap<A, B> map1, final SortedMap<A, B> map2, final boolean concatStrings) {
        // comparators must be equal
        if ((map1 == null) || (map2 == null)) return null;
        if (map1.comparator() != map2.comparator()) return null;
        if (map1.isEmpty() || map2.isEmpty()) return new TreeMap<A, B>(map1.comparator());

        // start most efficient method
        if (testIsCheaper(map1.size(), map2.size())) {
            if (map1.size() > map2.size()) return joinConstructiveByTest(map2, map1, concatStrings);
            return joinConstructiveByTest(map1, map2, concatStrings);
        }
        return joinConstructiveByEnumeration(map1, map2, concatStrings);
    }

    /**
     * Joins by iterating {@code small} and looking every key up in {@code large}.
     * Iteration happens while holding the monitor of {@code small}; every lookup holds the
     * monitor of {@code large}. If a {@code ConcurrentModificationException} occurs, a warning
     * is logged and the entries collected so far are returned.
     *
     * <p>Note: a key counts as present in {@code large} only if it maps to a non-null value
     * there; the enumeration variant does not make that distinction.
     *
     * @param small the map to iterate; result values are taken from this map
     * @param large the map to probe; its comparator is used for the result
     * @param concatStrings if true, String values of matching keys are concatenated
     * @return a new map with the common keys
     */
    private static <A, B> SortedMap<A, B> joinConstructiveByTest(final SortedMap<A, B> small, final SortedMap<A, B> large, final boolean concatStrings) {
        final SortedMap<A, B> result = new TreeMap<A, B>(large.comparator());
        synchronized (small) {
            final Iterator<Map.Entry<A, B>> mi = small.entrySet().iterator();
            Map.Entry<A, B> mentry1;
            B mobj2;
            loop: while (mi.hasNext()) {
                try {
                    mentry1 = mi.next();
                    synchronized (large) {
                        mobj2 = large.get(mentry1.getKey());
                    }
                    if (mobj2 != null) {
                        result.put(mentry1.getKey(), mergeValues(mentry1.getValue(), mobj2, concatStrings));
                    }
                } catch (ConcurrentModificationException e) {
                    Log.logWarning("SetTools", e.getMessage(), e);
                    break loop;
                }
            }
        }
        return result;
    }

    /**
     * Joins by walking both maps in parallel in key order.
     *
     * @param map1 first map; result values and the result comparator are taken from this map
     * @param map2 second map
     * @param concatStrings if true, String values of matching keys are concatenated
     * @return a new map with the common keys
     */
    private static <A, B> SortedMap<A, B> joinConstructiveByEnumeration(final SortedMap<A, B> map1, final SortedMap<A, B> map2, final boolean concatStrings) {
        // implement pairwise enumeration
        // comparator() is null for naturally ordered maps, so fall back to natural order
        final Comparator<? super A> comp = SetTools.<A>naturalIfNull(map1.comparator());
        final Iterator<Map.Entry<A, B>> mi1 = map1.entrySet().iterator();
        final Iterator<Map.Entry<A, B>> mi2 = map2.entrySet().iterator();
        final SortedMap<A, B> result = new TreeMap<A, B>(map1.comparator());
        int c;
        if ((mi1.hasNext()) && (mi2.hasNext())) {
            Map.Entry<A, B> mentry1 = mi1.next();
            Map.Entry<A, B> mentry2 = mi2.next();
            while (true) {
                c = comp.compare(mentry1.getKey(), mentry2.getKey());
                if (c < 0) {
                    // key of map1 is behind: advance map1
                    if (mi1.hasNext()) mentry1 = mi1.next(); else break;
                } else if (c > 0) {
                    // key of map2 is behind: advance map2
                    if (mi2.hasNext()) mentry2 = mi2.next(); else break;
                } else {
                    // common key: record it and advance both sides
                    result.put(mentry1.getKey(), mergeValues(mentry1.getValue(), mentry2.getValue(), concatStrings));
                    if (mi1.hasNext()) mentry1 = mi1.next(); else break;
                    if (mi2.hasNext()) mentry2 = mi2.next(); else break;
                }
            }
        }
        return result;
    }

    // now the same for set-set

    /**
     * Computes the intersection of two sorted sets.
     *
     * @param set1 first set
     * @param set2 second set
     * @return a new set with the common elements; null if either set is null or if the two
     *         sets do not share the identical comparator instance
     */
    public static <A> SortedSet<A> joinConstructive(final SortedSet<A> set1, final SortedSet<A> set2) {
        // comparators must be equal
        if ((set1 == null) || (set2 == null)) return null;
        if (set1.comparator() != set2.comparator()) return null;
        if (set1.isEmpty() || set2.isEmpty()) return new TreeSet<A>(set1.comparator());

        // start most efficient method
        if (testIsCheaper(set1.size(), set2.size())) {
            if (set1.size() < set2.size()) return joinConstructiveByTest(set1, set2);
            return joinConstructiveByTest(set2, set1);
        }
        return joinConstructiveByEnumeration(set1, set2);
    }

    /**
     * Intersects by iterating {@code small} and probing {@code large}.
     *
     * @param small the set to iterate; its comparator is used for the result
     * @param large the set to probe
     * @return a new set with the common elements
     */
    private static <A> SortedSet<A> joinConstructiveByTest(final SortedSet<A> small, final SortedSet<A> large) {
        final SortedSet<A> result = new TreeSet<A>(small.comparator());
        for (final A o : small) {
            if (large.contains(o)) result.add(o);
        }
        return result;
    }

    /**
     * Intersects by walking both sets in parallel in sorted order.
     *
     * @param set1 first set; its comparator is used for the result
     * @param set2 second set
     * @return a new set with the common elements
     */
    private static <A> SortedSet<A> joinConstructiveByEnumeration(final SortedSet<A> set1, final SortedSet<A> set2) {
        // implement pairwise enumeration
        // comparator() is null for naturally ordered sets, so fall back to natural order
        final Comparator<? super A> comp = SetTools.<A>naturalIfNull(set1.comparator());
        final Iterator<A> mi = set1.iterator();
        final Iterator<A> si = set2.iterator();
        final SortedSet<A> result = new TreeSet<A>(set1.comparator());
        int c;
        if ((mi.hasNext()) && (si.hasNext())) {
            A mobj = mi.next();
            A sobj = si.next();
            while (true) {
                c = comp.compare(mobj, sobj);
                if (c < 0) {
                    if (mi.hasNext()) mobj = mi.next(); else break;
                } else if (c > 0) {
                    if (si.hasNext()) sobj = si.next(); else break;
                } else {
                    result.add(mobj);
                    if (mi.hasNext()) mobj = mi.next(); else break;
                    if (si.hasNext()) sobj = si.next(); else break;
                }
            }
        }
        return result;
    }

    /**
     * test if one set is totally included in another set
     * @param <A> element type
     * @param small
     * @param large
     * @return true if the small set is completely included in the large set
     */
    public static <A> boolean totalInclusion(final Set<A> small, final Set<A> large) {
        for (final A o : small) {
            if (!large.contains(o)) return false;
        }
        return true;
    }

    /**
     * test if one set is totally included in another set
     * @param small
     * @param large
     * @return true if the small set is completely included in the large set
     */
    public static boolean totalInclusion(final HandleSet small, final HandleSet large) {
        for (final byte[] handle : small) {
            if (!large.has(handle)) return false;
        }
        return true;
    }

    /**
     * test if the intersection of two sets is not empty
     * @param <A> element type
     * @param set1
     * @param set2
     * @return true if any element of the first set is part of the second set or vice-versa;
     *         false if either set is null or empty, or if the sets do not share the
     *         identical comparator instance
     */
    public static <A> boolean anymatch(final SortedSet<A> set1, final SortedSet<A> set2) {
        // comparators must be equal
        if ((set1 == null) || (set2 == null)) return false;
        if (set1.comparator() != set2.comparator()) return false;
        if (set1.isEmpty() || set2.isEmpty()) return false;

        // start most efficient method
        if (testIsCheaper(set1.size(), set2.size())) {
            if (set1.size() < set2.size()) return anymatchByTest(set1, set2);
            return anymatchByTest(set2, set1);
        }
        return anymatchByEnumeration(set1, set2);
    }

    /**
     * test if the intersection of two sets is not empty
     * @param set1
     * @param set2
     * @return true if any element of the first set is part of the second set or vice-versa;
     *         false if either set is null or empty, or if the sets do not share the
     *         identical comparator instance
     */
    public static boolean anymatch(final HandleSet set1, final HandleSet set2) {
        // comparators must be equal
        if ((set1 == null) || (set2 == null)) return false;
        if (set1.comparator() != set2.comparator()) return false;
        if (set1.isEmpty() || set2.isEmpty()) return false;

        // start most efficient method
        if (testIsCheaper(set1.size(), set2.size())) {
            if (set1.size() < set2.size()) return anymatchByTest(set1, set2);
            return anymatchByTest(set2, set1);
        }
        return anymatchByEnumeration(set1, set2);
    }

    /** Probes {@code large} with every element of {@code small}; stops at the first hit. */
    private static <A> boolean anymatchByTest(final SortedSet<A> small, final SortedSet<A> large) {
        for (final A o : small) {
            if (large.contains(o)) return true;
        }
        return false;
    }

    /** Probes {@code large} with every handle of {@code small}; stops at the first hit. */
    private static boolean anymatchByTest(final HandleSet small, final HandleSet large) {
        final Iterator<byte[]> mi = small.iterator();
        byte[] o;
        while (mi.hasNext()) {
            o = mi.next();
            if (large.has(o)) return true;
        }
        return false;
    }

    /** Walks both sets in parallel in sorted order; stops at the first common element. */
    private static <A> boolean anymatchByEnumeration(final SortedSet<A> set1, final SortedSet<A> set2) {
        // implement pairwise enumeration
        // comparator() is null for naturally ordered sets, so fall back to natural order
        final Comparator<? super A> comp = SetTools.<A>naturalIfNull(set1.comparator());
        final Iterator<A> mi = set1.iterator();
        final Iterator<A> si = set2.iterator();
        int c;
        if ((mi.hasNext()) && (si.hasNext())) {
            A mobj = mi.next();
            A sobj = si.next();
            while (true) {
                c = comp.compare(mobj, sobj);
                if (c < 0) {
                    if (mi.hasNext()) mobj = mi.next(); else break;
                } else if (c > 0) {
                    if (si.hasNext()) sobj = si.next(); else break;
                } else {
                    return true;
                }
            }
        }
        return false;
    }

    /** Walks both handle sets in parallel in sorted order; stops at the first common handle. */
    private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
        // implement pairwise enumeration
        final Comparator<byte[]> comp = set1.comparator();
        final Iterator<byte[]> mi = set1.iterator();
        final Iterator<byte[]> si = set2.iterator();
        int c;
        if ((mi.hasNext()) && (si.hasNext())) {
            byte[] mobj = mi.next();
            byte[] sobj = si.next();
            while (true) {
                c = comp.compare(mobj, sobj);
                if (c < 0) {
                    if (mi.hasNext()) mobj = mi.next(); else break;
                } else if (c > 0) {
                    if (si.hasNext()) sobj = si.next(); else break;
                } else {
                    return true;
                }
            }
        }
        return false;
    }

    // ------------------------------------------------------------------------------------------------
    // exclude

    /*
    public static <A, B> TreeMap<A, B> excludeConstructive(final TreeMap<A, B> map, final Set<A> set) {
        if (map == null) return null;
        if (set == null) return map;
        if (map.isEmpty() || set.isEmpty()) return map;
        assert !(set instanceof TreeSet) || map.comparator() == ((TreeSet<A>) set).comparator();
        // if (map.comparator() != set.comparator()) return excludeConstructiveByTestMapInSet(map, set);
        return excludeConstructiveByTestMapInSet(map, set);
        // return excludeConstructiveByEnumeration(map, set);
    }

    private static <A, B> TreeMap<A, B> excludeConstructiveByTestMapInSet(final TreeMap<A, B> map, final Set<A> set) {
        final TreeMap<A, B> result = new TreeMap<A, B>(map.comparator());
        A o;
        for (Entry<A, B> entry: map.entrySet()) {
            o = entry.getKey();
            if (!(set.contains(o))) result.put(o, entry.getValue());
        }
        return result;
    }
    */

    /**
     * Removes from {@code map} every entry whose key is contained in {@code set}.
     * The map is modified in place; nothing happens if either argument is null or empty.
     *
     * @param map the map to remove entries from
     * @param set the keys to remove
     */
    public static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
        // comparators must be equal
        if (map == null) return;
        if (set == null) return;
        assert !(map instanceof SortedMap && set instanceof SortedSet) || ((SortedMap<A, B>) map).comparator() == ((SortedSet<A>) set).comparator();
        if (map.isEmpty() || set.isEmpty()) return;

        // iterate over the smaller collection and probe the larger one
        if (map.size() < set.size())
            excludeDestructiveByTestMapInSet(map, set);
        else
            excludeDestructiveByTestSetInMap(map, set);
    }

    /** Iterates the keys of {@code map} and removes those found in {@code set}. */
    private static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
        final Iterator<A> mi = map.keySet().iterator();
        while (mi.hasNext()) {
            if (set.contains(mi.next())) mi.remove();
        }
    }

    /** Iterates {@code set} and removes each of its elements from {@code map}. */
    private static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
        for (final A key : set) {
            map.remove(key);
        }
    }

    // and the same again with set-set

    /**
     * Removes from {@code set1} every element that is contained in {@code set2}.
     * {@code set1} is modified in place; nothing happens if either argument is null or empty.
     *
     * @param set1 the set to remove elements from
     * @param set2 the elements to remove
     */
    public static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
        if (set1 == null) return;
        if (set2 == null) return;
        assert !(set1 instanceof SortedSet && set2 instanceof SortedSet) || ((SortedSet<A>) set1).comparator() == ((SortedSet<A>) set2).comparator();
        if (set1.isEmpty() || set2.isEmpty()) return;

        // iterate over the smaller set and probe the larger one
        if (set1.size() < set2.size())
            excludeDestructiveByTestSmallInLarge(set1, set2);
        else
            excludeDestructiveByTestLargeInSmall(set1, set2);
    }

    /** Iterates {@code small} and removes from it the elements found in {@code large}. */
    private static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
        final Iterator<A> mi = small.iterator();
        while (mi.hasNext()) {
            if (large.contains(mi.next())) mi.remove();
        }
    }

    /** Iterates {@code small} and removes each of its elements from {@code large}. */
    private static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
        for (final A o : small) {
            large.remove(o);
        }
    }

    // ------------------------------------------------------------------------------------------------

    /** Closes a reader, ignoring a failure to do so; accepts null. */
    private static void closeQuietly(final BufferedReader reader) {
        if (reader == null) return;
        try {
            reader.close();
        } catch (final IOException ignored) {
            // the content has been read (or reading failed already); nothing left to recover
        }
    }

    /**
     * Loads a key/value map from a text file. Every line is trimmed; empty lines, lines
     * starting with '#', and lines where {@code sep} is missing or at position 0 are skipped.
     * The text before the first {@code sep} becomes the (trimmed, lower-cased) key, the text
     * after it the (trimmed) value. The file is decoded with the platform default charset.
     *
     * @param filename path of the file to read
     * @param sep separator between key and value
     * @return the loaded map; if reading fails, a warning is logged and the entries read so
     *         far (possibly none) are returned
     */
    public static SortedMap<String, String> loadMap(final String filename, final String sep) {
        final SortedMap<String, String> map = new TreeMap<String, String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            String line;
            int pos;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if ((line.length() > 0 && line.charAt(0) != '#') && ((pos = line.indexOf(sep)) > 0)) {
                    // fixed locale: keys must not depend on the default locale of the JVM
                    map.put(line.substring(0, pos).trim().toLowerCase(Locale.ENGLISH), line.substring(pos + sep.length()).trim());
                }
            }
        } catch (final IOException e) {
            Log.logWarning("SetTools", "cannot read map from '" + filename + "': " + e.getMessage(), e);
        } finally {
            closeQuietly(br);
        }
        return map;
    }

    /**
     * Loads a key/values map from a text file; like {@link #loadMap(String, String)}, but
     * a key may occur on several lines and all its values are collected in file order.
     *
     * @param filename path of the file to read
     * @param sep separator between key and value
     * @return the loaded map; if reading fails, a warning is logged and the entries read so
     *         far (possibly none) are returned
     */
    public static SortedMap<String, List<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
        final SortedMap<String, List<String>> map = new TreeMap<String, List<String>>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
            String line, key, value;
            int pos;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if ((line.length() > 0 && line.charAt(0) != '#') && ((pos = line.indexOf(sep)) > 0)) {
                    // fixed locale: keys must not depend on the default locale of the JVM
                    key = line.substring(0, pos).trim().toLowerCase(Locale.ENGLISH);
                    value = line.substring(pos + sep.length()).trim();
                    List<String> values = map.get(key);
                    if (values == null) {
                        values = new ArrayList<String>();
                        map.put(key, values);
                    }
                    values.add(value);
                }
            }
        } catch (final IOException e) {
            Log.logWarning("SetTools", "cannot read map from '" + filename + "': " + e.getMessage(), e);
        } finally {
            closeQuietly(br);
        }
        return map;
    }

    /**
     * Loads a set of lines from a text file. Every line is trimmed and lower-cased; empty
     * lines and lines starting with '#' are skipped. The file is decoded with the platform
     * default charset.
     *
     * @param file the file to read
     * @param c comparator for the resulting set
     * @return the loaded set; empty if the file does not exist. If reading fails, a warning
     *         is logged and the lines read so far are returned.
     */
    public static SortedSet<String> loadList(final File file, final Comparator<String> c) {
        final SortedSet<String> list = new TreeSet<String>(c);
        if (!(file.exists())) return list;

        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                // fixed locale: entries must not depend on the default locale of the JVM
                if (line.length() > 0 && line.charAt(0) != '#') list.add(line.toLowerCase(Locale.ENGLISH));
            }
        } catch (final IOException e) {
            Log.logWarning("SetTools", "cannot read list from '" + file + "': " + e.getMessage(), e);
        } finally {
            closeQuietly(br);
        }
        return list;
    }

    /**
     * Concatenates all handles of a set, each converted with {@code new String(byte[])}
     * (platform default charset), separated by {@code separator}.
     *
     * @param set the handles to print
     * @param separator character placed between two handles
     * @return the joined string; empty if the set is empty
     */
    public static String setToString(final HandleSet set, final char separator) {
        final Iterator<byte[]> i = set.iterator();
        final StringBuilder sb = new StringBuilder(set.size() * 7);
        if (i.hasNext()) sb.append(new String(i.next()));
        while (i.hasNext()) {
            sb.append(separator).append(new String(i.next()));
        }
        return sb.toString();
    }

    /**
     * Concatenates all elements of a set in iteration order, separated by {@code separator}.
     *
     * @param set the elements to print
     * @param separator character placed between two elements
     * @return the joined string; empty if the set is empty
     */
    public static String setToString(final Set<?> set, final char separator) {
        final Iterator<?> i = set.iterator();
        final StringBuilder sb = new StringBuilder(set.size() * 7);
        if (i.hasNext()) sb.append(i.next());
        while (i.hasNext()) {
            sb.append(separator).append(i.next());
        }
        return sb.toString();
    }

    // ------------------------------------------------------------------------------------------------

    /**
     * Small demonstration: joins two hard-coded maps with every join variant and prints
     * the results.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        final SortedMap<String, String> m = new TreeMap<String, String>();
        final SortedMap<String, String> s = new TreeMap<String, String>();
        m.put("a", "a");
        m.put("x", "x");
        m.put("f", "f");
        m.put("h", "h");
        m.put("w", "w");
        m.put("7", "7");
        m.put("t", "t");
        m.put("k", "k");
        m.put("y", "y");
        m.put("z", "z");
        s.put("a", "a");
        s.put("b", "b");
        s.put("c", "c");
        s.put("k", "k");
        s.put("l", "l");
        s.put("m", "m");
        s.put("n", "n");
        s.put("o", "o");
        s.put("p", "p");
        s.put("q", "q");
        s.put("r", "r");
        s.put("s", "s");
        s.put("t", "t");
        s.put("x", "x");
        System.out.println("Compare " + m.toString() + " with " + s.toString());
        System.out.println("Join=" + joinConstructiveByEnumeration(m, s, true));
        System.out.println("Join=" + joinConstructiveByTest(m, s, true));
        System.out.println("Join=" + joinConstructiveByTest(m, s, true));
        System.out.println("Join=" + joinConstructive(m, s, true));
        //System.out.println("Exclude=" + excludeConstructiveByTestMapInSet(m, s.keySet()));

        /*
        for (int low = 0; low < 10; low++)
            for (int high = 0; high < 100; high=high + 10) {
                int stepsEnum = 10 * high;
                int stepsTest = 12 * log2(high) * low;
                System.out.println("low=" + low + ", high=" + high + ", stepsEnum=" + stepsEnum + ", stepsTest=" + stepsTest + "; best method is " + ((stepsEnum < stepsTest) ? "joinByEnumeration" : "joinByTest"));
            }
        */
    }

}