From 3b992e6b007dbdb2f61c0938f74a5d41926aa284 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 9 Jun 2012 11:00:33 +0200 Subject: [PATCH] using utf8 String compression in Webstructure database --- source/net/yacy/kelondro/util/FileUtils.java | 15 ++++ .../peers/graphics/WebStructureGraph.java | 89 ++++++++++--------- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index beb03e34b..57a7c0da0 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -47,6 +47,7 @@ import java.io.Writer; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; @@ -506,6 +507,14 @@ public final class FileUtils } } + public static ConcurrentHashMap loadMapB(final File f) { + ConcurrentHashMap m = loadMap(f); + if (m == null) return null; + ConcurrentHashMap mb = new ConcurrentHashMap(); + for (Map.Entry e: m.entrySet()) mb.put(e.getKey(), UTF8.getBytes(e.getValue())); + return mb; + } + public static void saveMap(final File file, final Map props, final String comment) { PrintWriter pw = null; final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000)); @@ -543,6 +552,12 @@ public final class FileUtils } } + public static void saveMapB(final File file, final Map props, final String comment) { + HashMap m = new HashMap(); + for (Map.Entry e: props.entrySet()) m.put(e.getKey(), UTF8.String(e.getValue())); + saveMap(file, m, comment); + } + public static Set loadSet(final File file, final int chunksize, final boolean tree) throws IOException { final Set set = diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index 86f3f8572..2ab3394c0 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -74,8 +74,8 @@ public class WebStructureGraph private final static Log log = new Log("WebStructureGraph"); private final File structureFile; - private final TreeMap structure_old; // ',' to {}* - private final TreeMap structure_new; + private final TreeMap structure_old; // ',' to {}* + private final TreeMap structure_new; private final BlockingQueue publicRefDNSResolvingQueue; private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker; @@ -93,35 +93,36 @@ public class WebStructureGraph } public WebStructureGraph(final File structureFile) { - this.structure_old = new TreeMap(); - this.structure_new = new TreeMap(); + this.structure_old = new TreeMap(); + this.structure_new = new TreeMap(); this.structureFile = structureFile; this.publicRefDNSResolvingQueue = new LinkedBlockingQueue(); // load web structure - Map loadedStructure; + Map loadedStructureB; try { - loadedStructure = + loadedStructureB = (this.structureFile.exists()) - ? FileUtils.loadMap(this.structureFile) - : new TreeMap(); + ? FileUtils.loadMapB(this.structureFile) + : new TreeMap(); } catch ( final OutOfMemoryError e ) { - loadedStructure = new TreeMap(); + loadedStructureB = new TreeMap(); } - if ( loadedStructure != null ) { - this.structure_old.putAll(loadedStructure); + if ( loadedStructureB != null ) { + this.structure_old.putAll(loadedStructureB); } // delete out-dated entries in case the structure is too big if ( this.structure_old.size() > maxhosts ) { // fill a set with last-modified - dates of the structure final TreeSet delset = new TreeSet(); - String key, value; - for ( final Map.Entry entry : this.structure_old.entrySet() ) { + String key; + byte[] value; + for ( final Map.Entry entry : this.structure_old.entrySet() ) { key = entry.getKey(); value = entry.getValue(); - if ( value.length() >= 8 ) { - delset.add(value.substring(0, 8) + key); + if ( value != null && value.length >= 8 ) { + delset.add(UTF8.String(value).substring(0, 8) + key); } } int delcount = this.structure_old.size() - (maxhosts * 9 / 10); @@ -271,7 +272,7 @@ public class WebStructureGraph public StructureEntry outgoingReferences(final String hosthash) { // returns a map with a hosthash(String):refcount(Integer) relation assert hosthash.length() == 6; - SortedMap tailMap; + SortedMap tailMap; Map h = new HashMap(); String hostname = ""; String date = ""; @@ -282,7 +283,7 @@ public class WebStructureGraph final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { hostname = key.substring(7); - ref = tailMap.get(key); + ref = UTF8.String(tailMap.get(key)); date = ref.substring(0, 8); h = refstr2map(ref); } @@ -293,7 +294,7 @@ public class WebStructureGraph if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { - ref = tailMap.get(key); + ref = UTF8.String(tailMap.get(key)); if ( hostname.length() == 0 ) { hostname = key.substring(7); } @@ -533,14 +534,14 @@ public class WebStructureGraph if ( hosthash == null || hosthash.length() != 6 ) { return 0; } - SortedMap tailMap; + SortedMap tailMap; int c = 0; synchronized ( this.structure_old ) { tailMap = this.structure_old.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { - c = refstr2count(tailMap.get(key)); + c = refstr2count(UTF8.String(tailMap.get(key))); } } } @@ -549,7 +550,7 @@ public class WebStructureGraph if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { - c += refstr2count(tailMap.get(key)); + c += refstr2count(UTF8.String(tailMap.get(key))); } } } @@ -559,7 +560,7 @@ public class WebStructureGraph public String hostHash2hostName(final String hosthash) { // returns the host as string, null if unknown assert hosthash.length() == 6; - SortedMap tailMap; + SortedMap tailMap; synchronized ( this.structure_old ) { tailMap = this.structure_old.tailMap(hosthash); if ( !tailMap.isEmpty() ) { @@ -630,15 +631,15 @@ public class WebStructureGraph // store the map back to the structure synchronized ( this.structure_new ) { - this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs)); + this.structure_new.put(hosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs))); } } - private static void joinStructure(final TreeMap into, final TreeMap from) { - for ( final Map.Entry e : from.entrySet() ) { + private static void joinStructure(final TreeMap into, final TreeMap from) { + for ( final Map.Entry e : from.entrySet() ) { if ( into.containsKey(e.getKey()) ) { - final Map s0 = refstr2map(into.get(e.getKey())); - final Map s1 = refstr2map(e.getValue()); + final Map s0 = refstr2map(UTF8.String(into.get(e.getKey()))); + final Map s1 = refstr2map(UTF8.String(e.getValue())); for ( final Map.Entry r : s1.entrySet() ) { if ( s0.containsKey(r.getKey()) ) { s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue()); @@ -646,7 +647,7 @@ public class WebStructureGraph s0.put(r.getKey(), r.getValue().intValue()); } } - into.put(e.getKey(), map2refstr(s0)); + into.put(e.getKey(), UTF8.getBytes(map2refstr(s0))); } else { into.put(e.getKey(), e.getValue()); } @@ -665,8 +666,8 @@ public class WebStructureGraph String maxhost = null; int refsize, maxref = 0; synchronized ( this.structure_old ) { - for ( final Map.Entry entry : this.structure_old.entrySet() ) { - refsize = entry.getValue().length(); + for ( final Map.Entry entry : this.structure_old.entrySet() ) { + refsize = entry.getValue().length; if ( refsize > maxref ) { maxref = refsize; maxhost = entry.getKey().substring(7); @@ -674,8 +675,8 @@ public class WebStructureGraph } } synchronized ( this.structure_new ) { - for ( final Map.Entry entry : this.structure_new.entrySet() ) { - refsize = entry.getValue().length(); + for ( final Map.Entry entry : this.structure_new.entrySet() ) { + refsize = entry.getValue().length; if ( refsize > maxref ) { maxref = refsize; maxhost = entry.getKey().substring(7); @@ -693,23 +694,22 @@ public class WebStructureGraph Iterator { - private final Iterator> i; + private final Iterator> i; private StructureIterator(final boolean latest) { - this.i = - ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old) - .entrySet() - .iterator(); + this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old).entrySet().iterator(); } @Override public StructureEntry next0() { - Map.Entry entry = null; - String dom = null, ref = ""; + Map.Entry entry = null; + String dom = null; + byte[] ref = null; + String refs; while ( this.i.hasNext() ) { entry = this.i.next(); ref = entry.getValue(); - if ( (ref.length() - 8) % 10 != 0 ) { + if ( (ref.length - 8) % 10 != 0 ) { continue; } dom = entry.getKey(); @@ -721,12 +721,13 @@ public class WebStructureGraph if ( entry == null || dom == null ) { return null; } - assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length(); + assert (ref.length - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length; + refs = UTF8.String(ref); return new StructureEntry( dom.substring(0, 6), dom.substring(7), - ref.substring(0, 8), - refstr2map(ref)); + refs.substring(0, 8), + refstr2map(refs)); } } @@ -772,7 +773,7 @@ public class WebStructureGraph synchronized ( this.structure_old ) { if ( this.structure_old.size() > 0 ) { FileUtils - .saveMap( + .saveMapB( this.structureFile, this.structure_old, "Web Structure Syntax: ',' to {}*");