|
|
|
@ -99,7 +99,7 @@ public class WebStructureGraph {
|
|
|
|
|
Map<String, String> loadedStructure;
|
|
|
|
|
try {
|
|
|
|
|
loadedStructure = (this.structureFile.exists()) ? FileUtils.loadMap(this.structureFile) : new TreeMap<String, String>();
|
|
|
|
|
} catch (OutOfMemoryError e) {
|
|
|
|
|
} catch (final OutOfMemoryError e) {
|
|
|
|
|
loadedStructure = new TreeMap<String, String>();
|
|
|
|
|
}
|
|
|
|
|
if (loadedStructure != null) this.structure_old.putAll(loadedStructure);
|
|
|
|
@ -131,10 +131,10 @@ public class WebStructureGraph {
|
|
|
|
|
public void run() {
|
|
|
|
|
leanrefObject lro;
|
|
|
|
|
try {
|
|
|
|
|
while ((lro = publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON) {
|
|
|
|
|
while ((lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON) {
|
|
|
|
|
learnrefs(lro);
|
|
|
|
|
}
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
} catch (final InterruptedException e) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -155,25 +155,25 @@ public class WebStructureGraph {
|
|
|
|
|
globalRefURLs.add(u);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
leanrefObject lro = new leanrefObject(url, globalRefURLs);
|
|
|
|
|
final leanrefObject lro = new leanrefObject(url, globalRefURLs);
|
|
|
|
|
if (globalRefURLs.size() > 0) try {
|
|
|
|
|
if (this.publicRefDNSResolvingWorker.isAlive()) {
|
|
|
|
|
this.publicRefDNSResolvingQueue.put(lro);
|
|
|
|
|
} else {
|
|
|
|
|
this.learnrefs(lro);
|
|
|
|
|
learnrefs(lro);
|
|
|
|
|
}
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
this.learnrefs(lro);
|
|
|
|
|
} catch (final InterruptedException e) {
|
|
|
|
|
learnrefs(lro);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void learnrefs(final leanrefObject lro) {
|
|
|
|
|
final StringBuilder cpg = new StringBuilder(240);
|
|
|
|
|
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
|
|
|
|
|
final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part
|
|
|
|
|
//final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part
|
|
|
|
|
String nexturlhash;
|
|
|
|
|
for (MultiProtocolURI u: lro.globalRefURLs) {
|
|
|
|
|
byte[] nexturlhashb = new DigestURI(u).hash();
|
|
|
|
|
for (final MultiProtocolURI u: lro.globalRefURLs) {
|
|
|
|
|
final byte[] nexturlhashb = new DigestURI(u).hash();
|
|
|
|
|
assert nexturlhashb != null;
|
|
|
|
|
if (nexturlhashb != null) {
|
|
|
|
|
nexturlhash = ASCII.String(nexturlhashb);
|
|
|
|
@ -204,7 +204,7 @@ public class WebStructureGraph {
|
|
|
|
|
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
|
|
|
|
|
try {
|
|
|
|
|
d = Integer.valueOf(c.substring(6), 16);
|
|
|
|
|
} catch (NumberFormatException e) {
|
|
|
|
|
} catch (final NumberFormatException e) {
|
|
|
|
|
d = 1;
|
|
|
|
|
}
|
|
|
|
|
map.put(c.substring(0, 6), d);
|
|
|
|
@ -219,7 +219,7 @@ public class WebStructureGraph {
|
|
|
|
|
for (final Map.Entry<String, Integer> entry : map.entrySet()) {
|
|
|
|
|
s.append(entry.getKey());
|
|
|
|
|
h = Integer.toHexString(entry.getValue().intValue());
|
|
|
|
|
int hl = h.length();
|
|
|
|
|
final int hl = h.length();
|
|
|
|
|
if (hl == 0) {
|
|
|
|
|
s.append("0000");
|
|
|
|
|
} else if (hl == 1) {
|
|
|
|
@ -245,8 +245,8 @@ public class WebStructureGraph {
|
|
|
|
|
String hostname = "";
|
|
|
|
|
String date = "";
|
|
|
|
|
String ref;
|
|
|
|
|
synchronized (structure_old) {
|
|
|
|
|
tailMap = structure_old.tailMap(hosthash);
|
|
|
|
|
synchronized (this.structure_old) {
|
|
|
|
|
tailMap = this.structure_old.tailMap(hosthash);
|
|
|
|
|
if (!tailMap.isEmpty()) {
|
|
|
|
|
final String key = tailMap.firstKey();
|
|
|
|
|
if (key.startsWith(hosthash)) {
|
|
|
|
@ -257,8 +257,8 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
synchronized (structure_new) {
|
|
|
|
|
tailMap = structure_new.tailMap(hosthash);
|
|
|
|
|
synchronized (this.structure_new) {
|
|
|
|
|
tailMap = this.structure_new.tailMap(hosthash);
|
|
|
|
|
if (!tailMap.isEmpty()) {
|
|
|
|
|
final String key = tailMap.firstKey();
|
|
|
|
|
if (key.startsWith(hosthash)) {
|
|
|
|
@ -274,11 +274,11 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public StructureEntry incomingReferences(final String hosthash) {
|
|
|
|
|
String hostname = hostHash2hostName(hosthash);
|
|
|
|
|
final String hostname = hostHash2hostName(hosthash);
|
|
|
|
|
if (hostname == null) return null;
|
|
|
|
|
// collect the references
|
|
|
|
|
WebStructureGraph.StructureEntry sentry;
|
|
|
|
|
HashMap<String, Integer> hosthashes = new HashMap<String, Integer>();
|
|
|
|
|
final HashMap<String, Integer> hosthashes = new HashMap<String, Integer>();
|
|
|
|
|
Iterator<WebStructureGraph.StructureEntry> i = new StructureIterator(false);
|
|
|
|
|
while (i.hasNext()) {
|
|
|
|
|
sentry = i.next();
|
|
|
|
@ -308,11 +308,11 @@ public class WebStructureGraph {
|
|
|
|
|
return hostReferenceRow;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public HostReference produceSlow(Entry e) {
|
|
|
|
|
public HostReference produceSlow(final Entry e) {
|
|
|
|
|
return new HostReference(e);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public HostReference produceFast(HostReference e) {
|
|
|
|
|
public HostReference produceFast(final HostReference e) {
|
|
|
|
|
return e;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -334,7 +334,7 @@ public class WebStructureGraph {
|
|
|
|
|
this.entry = hostReferenceFactory.getRow().newEntry(json, true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Creates a host reference backed directly by the given row entry.
 * The entry is stored as-is, not copied.
 *
 * @param entry the row entry that holds this reference's columns
 */
public HostReference(final Row.Entry entry) {
    this.entry = entry;
}
|
|
|
|
|
|
|
|
|
@ -360,15 +360,15 @@ public class WebStructureGraph {
|
|
|
|
|
|
|
|
|
|
public void join(final Reference r) {
|
|
|
|
|
// joins two entries into one entry
|
|
|
|
|
HostReference oe = (HostReference) r;
|
|
|
|
|
final HostReference oe = (HostReference) r;
|
|
|
|
|
|
|
|
|
|
// combine date
|
|
|
|
|
long o = oe.lastModified();
|
|
|
|
|
if (this.lastModified() < o) this.entry.setCol(1, MicroDate.microDateDays(o));
|
|
|
|
|
final long o = oe.lastModified();
|
|
|
|
|
if (lastModified() < o) this.entry.setCol(1, MicroDate.microDateDays(o));
|
|
|
|
|
|
|
|
|
|
// combine count
|
|
|
|
|
int c = oe.count();
|
|
|
|
|
if (this.count() < c) this.entry.setCol(2, c);
|
|
|
|
|
final int c = oe.count();
|
|
|
|
|
if (count() < c) this.entry.setCol(2, c);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Collection<Integer> positions() {
|
|
|
|
@ -387,7 +387,7 @@ public class WebStructureGraph {
|
|
|
|
|
hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis()) return hostReferenceIndexCache;
|
|
|
|
|
|
|
|
|
|
// collect the references
|
|
|
|
|
ReferenceContainerCache<HostReference> idx = new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6);
|
|
|
|
|
final ReferenceContainerCache<HostReference> idx = new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6);
|
|
|
|
|
|
|
|
|
|
// we iterate over all structure entries.
|
|
|
|
|
// one structure entry has information that a specific host links to a list of other hosts
|
|
|
|
@ -402,23 +402,23 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void incomingReferencesEnrich(
|
|
|
|
|
ReferenceContainerCache<HostReference> idx,
|
|
|
|
|
Iterator<WebStructureGraph.StructureEntry> structureIterator,
|
|
|
|
|
long time) {
|
|
|
|
|
final ReferenceContainerCache<HostReference> idx,
|
|
|
|
|
final Iterator<WebStructureGraph.StructureEntry> structureIterator,
|
|
|
|
|
final long time) {
|
|
|
|
|
// we iterate over all structure entries.
|
|
|
|
|
// one structure entry has information that a specific host links to a list of other hosts
|
|
|
|
|
long timeout = System.currentTimeMillis() + time;
|
|
|
|
|
final long timeout = System.currentTimeMillis() + time;
|
|
|
|
|
byte[] term;
|
|
|
|
|
HostReference hr;
|
|
|
|
|
WebStructureGraph.StructureEntry sentry;
|
|
|
|
|
structureLoop: while (structureIterator.hasNext()) {
|
|
|
|
|
sentry = structureIterator.next();
|
|
|
|
|
// then we loop over all the hosts that are linked from sentry.hosthash
|
|
|
|
|
refloop: for (Map.Entry<String, Integer> refhosthashandcounter: sentry.references.entrySet()) {
|
|
|
|
|
refloop: for (final Map.Entry<String, Integer> refhosthashandcounter: sentry.references.entrySet()) {
|
|
|
|
|
term = UTF8.getBytes(refhosthashandcounter.getKey());
|
|
|
|
|
try {
|
|
|
|
|
hr = new HostReference(ASCII.getBytes(sentry.hosthash), GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), refhosthashandcounter.getValue().intValue());
|
|
|
|
|
} catch (ParseException e) {
|
|
|
|
|
} catch (final ParseException e) {
|
|
|
|
|
continue refloop;
|
|
|
|
|
}
|
|
|
|
|
// each term refers to an index entry. look if we already have such an entry
|
|
|
|
@ -431,7 +431,7 @@ public class WebStructureGraph {
|
|
|
|
|
} else {
|
|
|
|
|
r.put(hr);
|
|
|
|
|
}
|
|
|
|
|
} catch (RowSpaceExceededException e) {
|
|
|
|
|
} catch (final RowSpaceExceededException e) {
|
|
|
|
|
continue refloop;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -461,8 +461,8 @@ public class WebStructureGraph {
|
|
|
|
|
if (hosthash == null || hosthash.length() != 6) return 0;
|
|
|
|
|
SortedMap<String, String> tailMap;
|
|
|
|
|
int c = 0;
|
|
|
|
|
synchronized (structure_old) {
|
|
|
|
|
tailMap = structure_old.tailMap(hosthash);
|
|
|
|
|
synchronized (this.structure_old) {
|
|
|
|
|
tailMap = this.structure_old.tailMap(hosthash);
|
|
|
|
|
if (!tailMap.isEmpty()) {
|
|
|
|
|
final String key = tailMap.firstKey();
|
|
|
|
|
if (key.startsWith(hosthash)) {
|
|
|
|
@ -470,8 +470,8 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
synchronized (structure_new) {
|
|
|
|
|
tailMap = structure_new.tailMap(hosthash);
|
|
|
|
|
synchronized (this.structure_new) {
|
|
|
|
|
tailMap = this.structure_new.tailMap(hosthash);
|
|
|
|
|
if (!tailMap.isEmpty()) {
|
|
|
|
|
final String key = tailMap.firstKey();
|
|
|
|
|
if (key.startsWith(hosthash)) {
|
|
|
|
@ -486,8 +486,8 @@ public class WebStructureGraph {
|
|
|
|
|
// returns the host as string, null if unknown
|
|
|
|
|
assert hosthash.length() == 6;
|
|
|
|
|
SortedMap<String, String> tailMap;
|
|
|
|
|
synchronized(structure_old) {
|
|
|
|
|
tailMap = structure_old.tailMap(hosthash);
|
|
|
|
|
synchronized(this.structure_old) {
|
|
|
|
|
tailMap = this.structure_old.tailMap(hosthash);
|
|
|
|
|
if (!tailMap.isEmpty()) {
|
|
|
|
|
final String key = tailMap.firstKey();
|
|
|
|
|
if (key.startsWith(hosthash)) {
|
|
|
|
@ -495,8 +495,8 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
synchronized(structure_new) {
|
|
|
|
|
tailMap = structure_new.tailMap(hosthash);
|
|
|
|
|
synchronized(this.structure_new) {
|
|
|
|
|
tailMap = this.structure_new.tailMap(hosthash);
|
|
|
|
|
if (!tailMap.isEmpty()) {
|
|
|
|
|
final String key = tailMap.firstKey();
|
|
|
|
|
if (key.startsWith(hosthash)) {
|
|
|
|
@ -511,7 +511,7 @@ public class WebStructureGraph {
|
|
|
|
|
final String hosthash = ASCII.String(url.hash(), 6, 6);
|
|
|
|
|
|
|
|
|
|
// parse the new reference string and join it with the stored references
|
|
|
|
|
StructureEntry structure = outgoingReferences(hosthash);
|
|
|
|
|
final StructureEntry structure = outgoingReferences(hosthash);
|
|
|
|
|
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
|
|
|
|
|
assert reference.length() % 12 == 0 : "reference.length() = " + reference.length() + ", reference = " + reference.toString();
|
|
|
|
|
String dom;
|
|
|
|
@ -547,8 +547,8 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// store the map back to the structure
|
|
|
|
|
synchronized(structure_new) {
|
|
|
|
|
structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
|
|
|
|
|
synchronized(this.structure_new) {
|
|
|
|
|
this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -572,7 +572,7 @@ public class WebStructureGraph {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void joinOldNew() {
|
|
|
|
|
synchronized(structure_new) {
|
|
|
|
|
synchronized(this.structure_new) {
|
|
|
|
|
joinStructure(this.structure_old, this.structure_new);
|
|
|
|
|
this.structure_new.clear();
|
|
|
|
|
}
|
|
|
|
@ -583,8 +583,8 @@ public class WebStructureGraph {
|
|
|
|
|
String maxhost = null;
|
|
|
|
|
int refsize, maxref = 0;
|
|
|
|
|
joinOldNew();
|
|
|
|
|
synchronized(structure_new) {
|
|
|
|
|
for (final Map.Entry<String, String> entry : structure_old.entrySet()) {
|
|
|
|
|
synchronized(this.structure_new) {
|
|
|
|
|
for (final Map.Entry<String, String> entry : this.structure_old.entrySet()) {
|
|
|
|
|
refsize = entry.getValue().length();
|
|
|
|
|
if (refsize > maxref) {
|
|
|
|
|
maxref = refsize;
|
|
|
|
@ -604,14 +604,14 @@ public class WebStructureGraph {
|
|
|
|
|
private final Iterator<Map.Entry<String, String>> i;
|
|
|
|
|
|
|
|
|
|
/**
 * Creates an iterator over the entries of one of the enclosing graph's
 * structure maps.
 *
 * @param latest if {@code true} iterate over {@code structure_new},
 *               otherwise over {@code structure_old}
 */
private StructureIterator(final boolean latest) {
    // NOTE(review): the iterator is obtained without holding the map's monitor, while
    // other accessors in this file synchronize on the map - confirm callers guard iteration
    this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old).entrySet().iterator();
}
|
|
|
|
|
|
|
|
|
|
public StructureEntry next0() {
|
|
|
|
|
Map.Entry<String, String> entry = null;
|
|
|
|
|
String dom = null, ref = "";
|
|
|
|
|
while (i.hasNext()) {
|
|
|
|
|
entry = i.next();
|
|
|
|
|
while (this.i.hasNext()) {
|
|
|
|
|
entry = this.i.next();
|
|
|
|
|
ref = entry.getValue();
|
|
|
|
|
if ((ref.length() - 8) % 10 != 0) continue;
|
|
|
|
|
dom = entry.getKey();
|
|
|
|
@ -648,19 +648,19 @@ public class WebStructureGraph {
|
|
|
|
|
try {
|
|
|
|
|
this.publicRefDNSResolvingQueue.put(leanrefObjectPOISON);
|
|
|
|
|
this.publicRefDNSResolvingWorker.join(5000);
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
} catch (final InterruptedException e) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// save to web structure file
|
|
|
|
|
log.logInfo("Saving Web Structure File: new = " + this.structure_new.size() + " entries, old = " + this.structure_old.size() + " entries");
|
|
|
|
|
long time = System.currentTimeMillis();
|
|
|
|
|
final long time = System.currentTimeMillis();
|
|
|
|
|
joinOldNew();
|
|
|
|
|
if (this.structure_old.size() > 0) try {
|
|
|
|
|
synchronized(structure_old) {
|
|
|
|
|
synchronized(this.structure_old) {
|
|
|
|
|
if (this.structure_old.size() > 0) {
|
|
|
|
|
FileUtils.saveMap(this.structureFile, this.structure_old, "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
|
|
|
|
|
long t = Math.max(1, System.currentTimeMillis() - time);
|
|
|
|
|
final long t = Math.max(1, System.currentTimeMillis() - time);
|
|
|
|
|
log.logInfo("Saved Web Structure File: " + this.structure_old.size() + " entries in " + t + " milliseconds, " + (this.structure_old.size() * 1000 / t) + " entries/second");
|
|
|
|
|
}
|
|
|
|
|
this.structure_old.clear();
|
|
|
|
|