using utf8 String compression in Webstructure database

pull/1/head
Michael Peter Christen 13 years ago
parent 26301a538d
commit 3b992e6b00

@ -47,6 +47,7 @@ import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
@ -506,6 +507,14 @@ public final class FileUtils
}
}
/**
 * Loads a map from the given file through {@code loadMap} and returns a copy
 * in which every value has been converted with {@code UTF8.getBytes}.
 *
 * @param f the file handed to {@code loadMap}
 * @return a new map holding the same keys with converted values,
 *         or {@code null} when {@code loadMap} returned {@code null}
 */
public static ConcurrentHashMap<String, byte[]> loadMapB(final File f) {
    final ConcurrentHashMap<String, String> textMap = loadMap(f);
    if (textMap == null) {
        return null;
    }
    final ConcurrentHashMap<String, byte[]> byteMap = new ConcurrentHashMap<String, byte[]>();
    final Iterator<Map.Entry<String, String>> entries = textMap.entrySet().iterator();
    while (entries.hasNext()) {
        final Map.Entry<String, String> entry = entries.next();
        // convert each string value to its byte[] form; keys stay unchanged
        byteMap.put(entry.getKey(), UTF8.getBytes(entry.getValue()));
    }
    return byteMap;
}
public static void saveMap(final File file, final Map<String, String> props, final String comment) {
PrintWriter pw = null;
final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
@ -543,6 +552,12 @@ public final class FileUtils
}
}
/**
 * Saves a map with {@code byte[]} values by converting every value with
 * {@code UTF8.String} and delegating the resulting map to {@code saveMap}.
 *
 * @param file    the target file, passed through to {@code saveMap}
 * @param props   the entries to store
 * @param comment the comment passed through to {@code saveMap}
 */
public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
    final HashMap<String, String> textProps = new HashMap<String, String>();
    final Iterator<Map.Entry<String, byte[]>> entries = props.entrySet().iterator();
    while (entries.hasNext()) {
        final Map.Entry<String, byte[]> entry = entries.next();
        // convert each byte[] value to a String; keys stay unchanged
        textProps.put(entry.getKey(), UTF8.String(entry.getValue()));
    }
    saveMap(file, textProps, comment);
}
public static Set<String> loadSet(final File file, final int chunksize, final boolean tree)
throws IOException {
final Set<String> set =

@ -74,8 +74,8 @@ public class WebStructureGraph
private final static Log log = new Log("WebStructureGraph");
private final File structureFile;
private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, String> structure_new;
private final TreeMap<String, byte[]> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, byte[]> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker;
@ -93,35 +93,36 @@ public class WebStructureGraph
}
public WebStructureGraph(final File structureFile) {
this.structure_old = new TreeMap<String, String>();
this.structure_new = new TreeMap<String, String>();
this.structure_old = new TreeMap<String, byte[]>();
this.structure_new = new TreeMap<String, byte[]>();
this.structureFile = structureFile;
this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<leanrefObject>();
// load web structure
Map<String, String> loadedStructure;
Map<String, byte[]> loadedStructureB;
try {
loadedStructure =
loadedStructureB =
(this.structureFile.exists())
? FileUtils.loadMap(this.structureFile)
: new TreeMap<String, String>();
? FileUtils.loadMapB(this.structureFile)
: new TreeMap<String, byte[]>();
} catch ( final OutOfMemoryError e ) {
loadedStructure = new TreeMap<String, String>();
loadedStructureB = new TreeMap<String, byte[]>();
}
if ( loadedStructure != null ) {
this.structure_old.putAll(loadedStructure);
if ( loadedStructureB != null ) {
this.structure_old.putAll(loadedStructureB);
}
// delete out-dated entries in case the structure is too big
if ( this.structure_old.size() > maxhosts ) {
// fill a set with last-modified - dates of the structure
final TreeSet<String> delset = new TreeSet<String>();
String key, value;
for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) {
String key;
byte[] value;
for ( final Map.Entry<String, byte[]> entry : this.structure_old.entrySet() ) {
key = entry.getKey();
value = entry.getValue();
if ( value.length() >= 8 ) {
delset.add(value.substring(0, 8) + key);
if ( value != null && value.length >= 8 ) {
delset.add(UTF8.String(value).substring(0, 8) + key);
}
}
int delcount = this.structure_old.size() - (maxhosts * 9 / 10);
@ -271,7 +272,7 @@ public class WebStructureGraph
public StructureEntry outgoingReferences(final String hosthash) {
// returns a map with a hosthash(String):refcount(Integer) relation
assert hosthash.length() == 6;
SortedMap<String, String> tailMap;
SortedMap<String, byte[]> tailMap;
Map<String, Integer> h = new HashMap<String, Integer>();
String hostname = "";
String date = "";
@ -282,7 +283,7 @@ public class WebStructureGraph
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
hostname = key.substring(7);
ref = tailMap.get(key);
ref = UTF8.String(tailMap.get(key));
date = ref.substring(0, 8);
h = refstr2map(ref);
}
@ -293,7 +294,7 @@ public class WebStructureGraph
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
ref = tailMap.get(key);
ref = UTF8.String(tailMap.get(key));
if ( hostname.length() == 0 ) {
hostname = key.substring(7);
}
@ -533,14 +534,14 @@ public class WebStructureGraph
if ( hosthash == null || hosthash.length() != 6 ) {
return 0;
}
SortedMap<String, String> tailMap;
SortedMap<String, byte[]> tailMap;
int c = 0;
synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash);
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
c = refstr2count(tailMap.get(key));
c = refstr2count(UTF8.String(tailMap.get(key)));
}
}
}
@ -549,7 +550,7 @@ public class WebStructureGraph
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
c += refstr2count(tailMap.get(key));
c += refstr2count(UTF8.String(tailMap.get(key)));
}
}
}
@ -559,7 +560,7 @@ public class WebStructureGraph
public String hostHash2hostName(final String hosthash) {
// returns the host as string, null if unknown
assert hosthash.length() == 6;
SortedMap<String, String> tailMap;
SortedMap<String, byte[]> tailMap;
synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash);
if ( !tailMap.isEmpty() ) {
@ -630,15 +631,15 @@ public class WebStructureGraph
// store the map back to the structure
synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
this.structure_new.put(hosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs)));
}
}
private static void joinStructure(final TreeMap<String, String> into, final TreeMap<String, String> from) {
for ( final Map.Entry<String, String> e : from.entrySet() ) {
private static void joinStructure(final TreeMap<String, byte[]> into, final TreeMap<String, byte[]> from) {
for ( final Map.Entry<String, byte[]> e : from.entrySet() ) {
if ( into.containsKey(e.getKey()) ) {
final Map<String, Integer> s0 = refstr2map(into.get(e.getKey()));
final Map<String, Integer> s1 = refstr2map(e.getValue());
final Map<String, Integer> s0 = refstr2map(UTF8.String(into.get(e.getKey())));
final Map<String, Integer> s1 = refstr2map(UTF8.String(e.getValue()));
for ( final Map.Entry<String, Integer> r : s1.entrySet() ) {
if ( s0.containsKey(r.getKey()) ) {
s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue());
@ -646,7 +647,7 @@ public class WebStructureGraph
s0.put(r.getKey(), r.getValue().intValue());
}
}
into.put(e.getKey(), map2refstr(s0));
into.put(e.getKey(), UTF8.getBytes(map2refstr(s0)));
} else {
into.put(e.getKey(), e.getValue());
}
@ -665,8 +666,8 @@ public class WebStructureGraph
String maxhost = null;
int refsize, maxref = 0;
synchronized ( this.structure_old ) {
for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) {
refsize = entry.getValue().length();
for ( final Map.Entry<String, byte[]> entry : this.structure_old.entrySet() ) {
refsize = entry.getValue().length;
if ( refsize > maxref ) {
maxref = refsize;
maxhost = entry.getKey().substring(7);
@ -674,8 +675,8 @@ public class WebStructureGraph
}
}
synchronized ( this.structure_new ) {
for ( final Map.Entry<String, String> entry : this.structure_new.entrySet() ) {
refsize = entry.getValue().length();
for ( final Map.Entry<String, byte[]> entry : this.structure_new.entrySet() ) {
refsize = entry.getValue().length;
if ( refsize > maxref ) {
maxref = refsize;
maxhost = entry.getKey().substring(7);
@ -693,23 +694,22 @@ public class WebStructureGraph
Iterator<StructureEntry>
{
private final Iterator<Map.Entry<String, String>> i;
private final Iterator<Map.Entry<String, byte[]>> i;
private StructureIterator(final boolean latest) {
this.i =
((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old)
.entrySet()
.iterator();
this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old).entrySet().iterator();
}
@Override
public StructureEntry next0() {
Map.Entry<String, String> entry = null;
String dom = null, ref = "";
Map.Entry<String, byte[]> entry = null;
String dom = null;
byte[] ref = null;
String refs;
while ( this.i.hasNext() ) {
entry = this.i.next();
ref = entry.getValue();
if ( (ref.length() - 8) % 10 != 0 ) {
if ( (ref.length - 8) % 10 != 0 ) {
continue;
}
dom = entry.getKey();
@ -721,12 +721,13 @@ public class WebStructureGraph
if ( entry == null || dom == null ) {
return null;
}
assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length();
assert (ref.length - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length;
refs = UTF8.String(ref);
return new StructureEntry(
dom.substring(0, 6),
dom.substring(7),
ref.substring(0, 8),
refstr2map(ref));
refs.substring(0, 8),
refstr2map(refs));
}
}
@ -772,7 +773,7 @@ public class WebStructureGraph
synchronized ( this.structure_old ) {
if ( this.structure_old.size() > 0 ) {
FileUtils
.saveMap(
.saveMapB(
this.structureFile,
this.structure_old,
"Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");

Loading…
Cancel
Save