Use UTF-8 byte-array string compression in the WebStructure database

pull/1/head
Michael Peter Christen 13 years ago
parent 26301a538d
commit 3b992e6b00

@ -47,6 +47,7 @@ import java.io.Writer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
@ -506,6 +507,14 @@ public final class FileUtils
} }
} }
/**
 * Loads a map through {@code loadMap(f)} and returns a copy whose values are
 * byte arrays produced by {@code UTF8.getBytes} from the loaded strings.
 *
 * @param f the file handed to {@code loadMap}
 * @return a new map with the same keys and byte-array values, or
 *         {@code null} when {@code loadMap} returned {@code null}
 */
public static ConcurrentHashMap<String, byte[]> loadMapB(final File f) {
    final ConcurrentHashMap<String, String> textMap = loadMap(f);
    if (textMap == null) {
        // nothing could be loaded; hand the null through to the caller
        return null;
    }

    final ConcurrentHashMap<String, byte[]> byteMap = new ConcurrentHashMap<String, byte[]>();
    for (final Map.Entry<String, String> textEntry : textMap.entrySet()) {
        final String key = textEntry.getKey();
        final byte[] encoded = UTF8.getBytes(textEntry.getValue());
        byteMap.put(key, encoded);
    }
    return byteMap;
}
public static void saveMap(final File file, final Map<String, String> props, final String comment) { public static void saveMap(final File file, final Map<String, String> props, final String comment) {
PrintWriter pw = null; PrintWriter pw = null;
final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000)); final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
@ -543,6 +552,12 @@ public final class FileUtils
} }
} }
/**
 * Saves a map with byte-array values by converting every value to a string
 * with {@code UTF8.String} and delegating to {@code saveMap}.
 *
 * The converted entries are collected in a {@code LinkedHashMap}, so they
 * reach {@code saveMap} in the iteration order of {@code props} (key order
 * when a sorted map is passed in) rather than in hash order. A {@code null}
 * value is stored as an empty string instead of being handed to
 * {@code UTF8.String}.
 *
 * @param file    the target file, passed unchanged to {@code saveMap}
 * @param props   the entries to save; keys are kept, values are converted
 * @param comment the comment, passed unchanged to {@code saveMap}
 */
public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
    // fully qualified so that no additional import is required in this file
    final Map<String, String> m = new java.util.LinkedHashMap<String, String>();
    for (final Map.Entry<String, byte[]> e : props.entrySet()) {
        final byte[] value = e.getValue();
        // maps such as TreeMap permit null values; do not try to decode them
        m.put(e.getKey(), value == null ? "" : UTF8.String(value));
    }
    saveMap(file, m, comment);
}
public static Set<String> loadSet(final File file, final int chunksize, final boolean tree) public static Set<String> loadSet(final File file, final int chunksize, final boolean tree)
throws IOException { throws IOException {
final Set<String> set = final Set<String> set =

@ -74,8 +74,8 @@ public class WebStructureGraph
private final static Log log = new Log("WebStructureGraph"); private final static Log log = new Log("WebStructureGraph");
private final File structureFile; private final File structureFile;
private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}* private final TreeMap<String, byte[]> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, String> structure_new; private final TreeMap<String, byte[]> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue; private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker; private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker;
@ -93,35 +93,36 @@ public class WebStructureGraph
} }
public WebStructureGraph(final File structureFile) { public WebStructureGraph(final File structureFile) {
this.structure_old = new TreeMap<String, String>(); this.structure_old = new TreeMap<String, byte[]>();
this.structure_new = new TreeMap<String, String>(); this.structure_new = new TreeMap<String, byte[]>();
this.structureFile = structureFile; this.structureFile = structureFile;
this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<leanrefObject>(); this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<leanrefObject>();
// load web structure // load web structure
Map<String, String> loadedStructure; Map<String, byte[]> loadedStructureB;
try { try {
loadedStructure = loadedStructureB =
(this.structureFile.exists()) (this.structureFile.exists())
? FileUtils.loadMap(this.structureFile) ? FileUtils.loadMapB(this.structureFile)
: new TreeMap<String, String>(); : new TreeMap<String, byte[]>();
} catch ( final OutOfMemoryError e ) { } catch ( final OutOfMemoryError e ) {
loadedStructure = new TreeMap<String, String>(); loadedStructureB = new TreeMap<String, byte[]>();
} }
if ( loadedStructure != null ) { if ( loadedStructureB != null ) {
this.structure_old.putAll(loadedStructure); this.structure_old.putAll(loadedStructureB);
} }
// delete out-dated entries in case the structure is too big // delete out-dated entries in case the structure is too big
if ( this.structure_old.size() > maxhosts ) { if ( this.structure_old.size() > maxhosts ) {
// fill a set with last-modified - dates of the structure // fill a set with last-modified - dates of the structure
final TreeSet<String> delset = new TreeSet<String>(); final TreeSet<String> delset = new TreeSet<String>();
String key, value; String key;
for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) { byte[] value;
for ( final Map.Entry<String, byte[]> entry : this.structure_old.entrySet() ) {
key = entry.getKey(); key = entry.getKey();
value = entry.getValue(); value = entry.getValue();
if ( value.length() >= 8 ) { if ( value != null && value.length >= 8 ) {
delset.add(value.substring(0, 8) + key); delset.add(UTF8.String(value).substring(0, 8) + key);
} }
} }
int delcount = this.structure_old.size() - (maxhosts * 9 / 10); int delcount = this.structure_old.size() - (maxhosts * 9 / 10);
@ -271,7 +272,7 @@ public class WebStructureGraph
public StructureEntry outgoingReferences(final String hosthash) { public StructureEntry outgoingReferences(final String hosthash) {
// returns a map with a hosthash(String):refcount(Integer) relation // returns a map with a hosthash(String):refcount(Integer) relation
assert hosthash.length() == 6; assert hosthash.length() == 6;
SortedMap<String, String> tailMap; SortedMap<String, byte[]> tailMap;
Map<String, Integer> h = new HashMap<String, Integer>(); Map<String, Integer> h = new HashMap<String, Integer>();
String hostname = ""; String hostname = "";
String date = ""; String date = "";
@ -282,7 +283,7 @@ public class WebStructureGraph
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) { if ( key.startsWith(hosthash) ) {
hostname = key.substring(7); hostname = key.substring(7);
ref = tailMap.get(key); ref = UTF8.String(tailMap.get(key));
date = ref.substring(0, 8); date = ref.substring(0, 8);
h = refstr2map(ref); h = refstr2map(ref);
} }
@ -293,7 +294,7 @@ public class WebStructureGraph
if ( !tailMap.isEmpty() ) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) { if ( key.startsWith(hosthash) ) {
ref = tailMap.get(key); ref = UTF8.String(tailMap.get(key));
if ( hostname.length() == 0 ) { if ( hostname.length() == 0 ) {
hostname = key.substring(7); hostname = key.substring(7);
} }
@ -533,14 +534,14 @@ public class WebStructureGraph
if ( hosthash == null || hosthash.length() != 6 ) { if ( hosthash == null || hosthash.length() != 6 ) {
return 0; return 0;
} }
SortedMap<String, String> tailMap; SortedMap<String, byte[]> tailMap;
int c = 0; int c = 0;
synchronized ( this.structure_old ) { synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash); tailMap = this.structure_old.tailMap(hosthash);
if ( !tailMap.isEmpty() ) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) { if ( key.startsWith(hosthash) ) {
c = refstr2count(tailMap.get(key)); c = refstr2count(UTF8.String(tailMap.get(key)));
} }
} }
} }
@ -549,7 +550,7 @@ public class WebStructureGraph
if ( !tailMap.isEmpty() ) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) { if ( key.startsWith(hosthash) ) {
c += refstr2count(tailMap.get(key)); c += refstr2count(UTF8.String(tailMap.get(key)));
} }
} }
} }
@ -559,7 +560,7 @@ public class WebStructureGraph
public String hostHash2hostName(final String hosthash) { public String hostHash2hostName(final String hosthash) {
// returns the host as string, null if unknown // returns the host as string, null if unknown
assert hosthash.length() == 6; assert hosthash.length() == 6;
SortedMap<String, String> tailMap; SortedMap<String, byte[]> tailMap;
synchronized ( this.structure_old ) { synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash); tailMap = this.structure_old.tailMap(hosthash);
if ( !tailMap.isEmpty() ) { if ( !tailMap.isEmpty() ) {
@ -630,15 +631,15 @@ public class WebStructureGraph
// store the map back to the structure // store the map back to the structure
synchronized ( this.structure_new ) { synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs)); this.structure_new.put(hosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs)));
} }
} }
private static void joinStructure(final TreeMap<String, String> into, final TreeMap<String, String> from) { private static void joinStructure(final TreeMap<String, byte[]> into, final TreeMap<String, byte[]> from) {
for ( final Map.Entry<String, String> e : from.entrySet() ) { for ( final Map.Entry<String, byte[]> e : from.entrySet() ) {
if ( into.containsKey(e.getKey()) ) { if ( into.containsKey(e.getKey()) ) {
final Map<String, Integer> s0 = refstr2map(into.get(e.getKey())); final Map<String, Integer> s0 = refstr2map(UTF8.String(into.get(e.getKey())));
final Map<String, Integer> s1 = refstr2map(e.getValue()); final Map<String, Integer> s1 = refstr2map(UTF8.String(e.getValue()));
for ( final Map.Entry<String, Integer> r : s1.entrySet() ) { for ( final Map.Entry<String, Integer> r : s1.entrySet() ) {
if ( s0.containsKey(r.getKey()) ) { if ( s0.containsKey(r.getKey()) ) {
s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue()); s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue());
@ -646,7 +647,7 @@ public class WebStructureGraph
s0.put(r.getKey(), r.getValue().intValue()); s0.put(r.getKey(), r.getValue().intValue());
} }
} }
into.put(e.getKey(), map2refstr(s0)); into.put(e.getKey(), UTF8.getBytes(map2refstr(s0)));
} else { } else {
into.put(e.getKey(), e.getValue()); into.put(e.getKey(), e.getValue());
} }
@ -665,8 +666,8 @@ public class WebStructureGraph
String maxhost = null; String maxhost = null;
int refsize, maxref = 0; int refsize, maxref = 0;
synchronized ( this.structure_old ) { synchronized ( this.structure_old ) {
for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) { for ( final Map.Entry<String, byte[]> entry : this.structure_old.entrySet() ) {
refsize = entry.getValue().length(); refsize = entry.getValue().length;
if ( refsize > maxref ) { if ( refsize > maxref ) {
maxref = refsize; maxref = refsize;
maxhost = entry.getKey().substring(7); maxhost = entry.getKey().substring(7);
@ -674,8 +675,8 @@ public class WebStructureGraph
} }
} }
synchronized ( this.structure_new ) { synchronized ( this.structure_new ) {
for ( final Map.Entry<String, String> entry : this.structure_new.entrySet() ) { for ( final Map.Entry<String, byte[]> entry : this.structure_new.entrySet() ) {
refsize = entry.getValue().length(); refsize = entry.getValue().length;
if ( refsize > maxref ) { if ( refsize > maxref ) {
maxref = refsize; maxref = refsize;
maxhost = entry.getKey().substring(7); maxhost = entry.getKey().substring(7);
@ -693,23 +694,22 @@ public class WebStructureGraph
Iterator<StructureEntry> Iterator<StructureEntry>
{ {
private final Iterator<Map.Entry<String, String>> i; private final Iterator<Map.Entry<String, byte[]>> i;
private StructureIterator(final boolean latest) { private StructureIterator(final boolean latest) {
this.i = this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old).entrySet().iterator();
((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old)
.entrySet()
.iterator();
} }
@Override @Override
public StructureEntry next0() { public StructureEntry next0() {
Map.Entry<String, String> entry = null; Map.Entry<String, byte[]> entry = null;
String dom = null, ref = ""; String dom = null;
byte[] ref = null;
String refs;
while ( this.i.hasNext() ) { while ( this.i.hasNext() ) {
entry = this.i.next(); entry = this.i.next();
ref = entry.getValue(); ref = entry.getValue();
if ( (ref.length() - 8) % 10 != 0 ) { if ( (ref.length - 8) % 10 != 0 ) {
continue; continue;
} }
dom = entry.getKey(); dom = entry.getKey();
@ -721,12 +721,13 @@ public class WebStructureGraph
if ( entry == null || dom == null ) { if ( entry == null || dom == null ) {
return null; return null;
} }
assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length(); assert (ref.length - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length;
refs = UTF8.String(ref);
return new StructureEntry( return new StructureEntry(
dom.substring(0, 6), dom.substring(0, 6),
dom.substring(7), dom.substring(7),
ref.substring(0, 8), refs.substring(0, 8),
refstr2map(ref)); refstr2map(refs));
} }
} }
@ -772,7 +773,7 @@ public class WebStructureGraph
synchronized ( this.structure_old ) { synchronized ( this.structure_old ) {
if ( this.structure_old.size() > 0 ) { if ( this.structure_old.size() > 0 ) {
FileUtils FileUtils
.saveMap( .saveMapB(
this.structureFile, this.structureFile,
this.structure_old, this.structure_old,
"Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*"); "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");

Loading…
Cancel
Save