Added automated unit tests and a performance test for the WebStructureGraph class.

Fixed references count when multiple links target the same domain name
in one document.
pull/105/head
luccioman 8 years ago
parent f793d97e56
commit 86adfef30f

@ -78,8 +78,9 @@ public class WebStructureGraph {
private final static ConcurrentLog log = new ConcurrentLog("WebStructureGraph");
/** Backup file */
/** Eventual backup file */
private final File structureFile;
/** Older structure entries (notably loaded from the backup file) */
private final TreeMap<String, byte[]> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
@ -95,11 +96,17 @@ public class WebStructureGraph {
/** Entry used to terminate the worker thread */
private final static LearnObject leanrefObjectPOISON = new LearnObject(null, null);
private static class LearnObject {
/**
* Used to feed a new entry to this web structure
*/
protected static class LearnObject {
/** Source URL */
private final DigestURL url;
/** Target link URLs */
private final Set<DigestURL> globalRefURLs;
private LearnObject(final DigestURL url, final Set<DigestURL> globalRefURLs) {
protected LearnObject(final DigestURL url, final Set<DigestURL> globalRefURLs) {
this.url = url;
this.globalRefURLs = globalRefURLs;
}
@ -118,20 +125,19 @@ public class WebStructureGraph {
this.structureFile = structureFile;
this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<LearnObject>();
// load web structure
// load web structure from file if exists
Map<String, byte[]> loadedStructureB;
try {
loadedStructureB =
(this.structureFile.exists())
? FileUtils.loadMapB(this.structureFile)
: new TreeMap<String, byte[]>();
if(this.structureFile != null && this.structureFile.exists()) {
loadedStructureB = FileUtils.loadMapB(this.structureFile);
log.info("loaded dump of " + loadedStructureB.size() + " entries from " + this.structureFile.toString());
} else {
loadedStructureB = new TreeMap<String, byte[]>();
}
} catch (final OutOfMemoryError e ) {
loadedStructureB = new TreeMap<String, byte[]>();
}
if ( loadedStructureB != null ) {
this.structure_old.putAll(loadedStructureB);
}
log.info("loaded dump of " + loadedStructureB.size() + " entries from " + this.structureFile.toString());
this.structure_old.putAll(loadedStructureB);
// delete out-dated entries in case the structure is too big
if ( this.structure_old.size() > maxhosts ) {
@ -611,33 +617,29 @@ public class WebStructureGraph {
}
private void learnrefs(final LearnObject lro) {
final Set<String> refhosts = new HashSet<String>();
String hosthash;
for ( final DigestURL u : lro.globalRefURLs ) {
if (Switchboard.getSwitchboard().shallTerminate()) break;
hosthash = ASCII.String(u.hash(), 6, 6);
if (!exists(hosthash)) {
// this must be recorded as an host with no references
synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + u.getHost(), UTF8.getBytes(none2refstr()));
}
}
refhosts.add(hosthash);
}
protected void learnrefs(final LearnObject lro) {
final DigestURL url = lro.url;
hosthash = ASCII.String(url.hash(), 6, 6);
final String sourceHosthash = ASCII.String(url.hash(), 6, 6);
// parse the new reference string and join it with the stored references
final StructureEntry structure = outgoingReferences(hosthash);
final StructureEntry structure = outgoingReferences(sourceHosthash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
int c;
for (String dom: refhosts) {
for (final DigestURL u : lro.globalRefURLs) {
String domain = ASCII.String(u.hash(), 6, 6);
if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().shallTerminate()) break;
if (!exists(domain)) {
// this must be recorded as an host with no references
synchronized ( this.structure_new ) {
this.structure_new.put(domain + "," + u.getHost(), UTF8.getBytes(none2refstr()));
}
}
c = 0;
if ( refs.containsKey(dom) ) {
c = (refs.get(dom)).intValue();
Integer existingCount = refs.get(domain);
if ( existingCount != null) {
c = existingCount.intValue();
}
refs.put(dom, Integer.valueOf(++c));
refs.put(domain, Integer.valueOf(++c));
}
// check if the maxref is exceeded
@ -667,7 +669,7 @@ public class WebStructureGraph {
// store the map back to the structure
synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs)));
this.structure_new.put(sourceHosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs)));
}
}
@ -781,11 +783,17 @@ public class WebStructureGraph {
}
public static class StructureEntry implements Comparable<StructureEntry> {
/** the tail of the host hash */
public String hosthash;
public String hosthash; // the tail of the host hash
public String hostname; // the host name
public String date; // date of latest change
public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host
/** the host name */
public String hostname;
/** date of latest change */
public String date;
/** a map from the referenced host hash to the number of referenced to that host */
public Map<String, Integer> references;
private StructureEntry(final String hosthash, final String hostname) {
this(hosthash, hostname, GenericFormatter.SHORT_DAY_FORMATTER.format(), new HashMap<String, Integer>());
@ -831,33 +839,35 @@ public class WebStructureGraph {
}
// save to web structure file
log.info("Saving Web Structure File: new = "
+ this.structure_new.size()
+ " entries, old = "
+ this.structure_old.size()
+ " entries");
final long time = System.currentTimeMillis();
joinOldNew();
log.info("dumping " + structure_old.size() + " entries to " + structureFile.toString());
if ( !this.structure_old.isEmpty() ) {
synchronized ( this.structure_old ) {
if ( !this.structure_old.isEmpty() ) {
FileUtils
.saveMapB(
this.structureFile,
this.structure_old,
"Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
final long t = Math.max(1, System.currentTimeMillis() - time);
log.info("Saved Web Structure File: "
+ this.structure_old.size()
+ " entries in "
+ t
+ " milliseconds, "
+ (this.structure_old.size() * 1000 / t)
+ " entries/second");
}
this.structure_old.clear();
}
if(this.structureFile != null) {
log.info("Saving Web Structure File: new = "
+ this.structure_new.size()
+ " entries, old = "
+ this.structure_old.size()
+ " entries");
final long time = System.currentTimeMillis();
joinOldNew();
log.info("dumping " + structure_old.size() + " entries to " + structureFile.toString());
if ( !this.structure_old.isEmpty() ) {
synchronized ( this.structure_old ) {
if ( !this.structure_old.isEmpty() ) {
FileUtils
.saveMapB(
this.structureFile,
this.structure_old,
"Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
final long t = Math.max(1, System.currentTimeMillis() - time);
log.info("Saved Web Structure File: "
+ this.structure_old.size()
+ " entries in "
+ t
+ " milliseconds, "
+ (this.structure_old.size() * 1000 / t)
+ " entries/second");
}
this.structure_old.clear();
}
}
}
}
}

@ -0,0 +1,263 @@
// WebStructureGraphTest.java
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.peers.graphics;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.graphics.WebStructureGraph.LearnObject;
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
/**
* Unit tests for {@link WebStructureGraph}
*
* @author luccioman
*
*/
public class WebStructureGraphTest {

	/**
	 * Most basic outgoing references unit test : one source document linking once
	 * to a single document on another target host.
	 */
	@Test
	public void testOutgoingReferences() throws MalformedURLException {
		/* No backup file : the structure lives in memory only */
		WebStructureGraph graph = new WebStructureGraph(null);
		try {
			final DigestURL source = new DigestURL("http://source.net/index.html");
			final String sourceHash = source.hosthash();
			final Set<DigestURL> targets = new HashSet<>();
			final DigestURL target = new DigestURL("http://target.com/index.html");
			final String targetHash = target.hosthash();
			targets.add(target);
			LearnObject lro = new LearnObject(source, targets);
			graph.learnrefs(lro);

			/* Check that reference from the exact source URL is retrieved from structure */
			StructureEntry outRefs = graph.outgoingReferences(sourceHash);
			Assert.assertNotNull(outRefs);
			Assert.assertEquals("source.net", outRefs.hostname);
			Assert.assertNotNull(outRefs.references);
			Assert.assertEquals(1, outRefs.references.size());
			Assert.assertEquals(Integer.valueOf(1), outRefs.references.get(targetHash));

			/* Check that reference from the host name URL is retrieved from structure :
			 * references are keyed by host hash, so any URL on the same host must resolve */
			outRefs = graph.outgoingReferences(new DigestURL("http://source.net").hosthash());
			Assert.assertNotNull(outRefs);
			Assert.assertEquals("source.net", outRefs.hostname);
			Assert.assertNotNull(outRefs.references);
			Assert.assertEquals(1, outRefs.references.size());
			Assert.assertEquals(Integer.valueOf(1), outRefs.references.get(targetHash));
		} finally {
			graph.close();
		}
	}

	/**
	 * Outgoing references from one source document to different resources (path,
	 * query) on the same target host : counts must accumulate on the single host.
	 */
	@Test
	public void testOutgoingFromOneToMultipleSameTargeHost() throws MalformedURLException {
		WebStructureGraph graph = new WebStructureGraph(null);
		try {
			final DigestURL source = new DigestURL("http://source.net/index.html");
			final String sourceHash = source.hosthash();
			final Set<DigestURL> targets = new HashSet<>();
			final DigestURL indexTarget = new DigestURL("http://target.com/index.html");
			targets.add(indexTarget);
			final DigestURL pathTarget = new DigestURL("http://target.com/path/doc.html");
			targets.add(pathTarget);
			final DigestURL queryTarget = new DigestURL("http://target.com/path/query?param=value");
			targets.add(queryTarget);
			LearnObject lro = new LearnObject(source, targets);
			graph.learnrefs(lro);

			/* Check that accumulated references from the host name URL is retrieved from structure */
			StructureEntry outRefs = graph.outgoingReferences(sourceHash);
			Assert.assertNotNull(outRefs);
			Assert.assertEquals("source.net", outRefs.hostname);
			Assert.assertNotNull(outRefs.references);
			/* One accumulated host target reference */
			Assert.assertEquals(1, outRefs.references.size());
			/* 3 accumulated links to that target host */
			Assert.assertEquals(Integer.valueOf(3), outRefs.references.get(indexTarget.hosthash()));
		} finally {
			graph.close();
		}
	}

	/**
	 * Most basic incoming references unit test : one source document linking once
	 * to a single document on another target host, queried from the target side.
	 */
	@Test
	public void testIncomingReferences() throws MalformedURLException {
		WebStructureGraph graph = new WebStructureGraph(null);
		try {
			final DigestURL source = new DigestURL("http://source.net/index.html");
			final String sourceHash = source.hosthash();
			final Set<DigestURL> targets = new HashSet<>();
			final DigestURL target = new DigestURL("http://target.com/index.html");
			final String targetHash = target.hosthash();
			targets.add(target);
			LearnObject lro = new LearnObject(source, targets);
			graph.learnrefs(lro);

			/* Check that reference to the exact target URL is retrieved from structure */
			StructureEntry inRefs = graph.incomingReferences(targetHash);
			Assert.assertNotNull(inRefs);
			Assert.assertEquals("target.com", inRefs.hostname);
			Assert.assertNotNull(inRefs.references);
			Assert.assertEquals(1, inRefs.references.size());
			Assert.assertEquals(Integer.valueOf(1), inRefs.references.get(sourceHash));

			/* Check that reference to the host name target URL is retrieved from structure */
			inRefs = graph.incomingReferences(new DigestURL("http://target.com").hosthash());
			Assert.assertNotNull(inRefs);
			Assert.assertEquals("target.com", inRefs.hostname);
			Assert.assertNotNull(inRefs.references);
			Assert.assertEquals(1, inRefs.references.size());
			Assert.assertEquals(Integer.valueOf(1), inRefs.references.get(sourceHash));
		} finally {
			graph.close();
		}
	}

	/**
	 * Incoming references from multiple source documents (path, query) on the same
	 * host to one target URL : counts must accumulate on the single source host.
	 */
	@Test
	public void testIncomingReferencesFromMultipleSourcesOnOneHost() throws MalformedURLException {
		WebStructureGraph graph = new WebStructureGraph(null);
		try {
			final DigestURL indexSource = new DigestURL("http://source.net/index.html");
			final String sourceHash = indexSource.hosthash();
			final DigestURL target = new DigestURL("http://target.com/index.html");
			final String targetHash = target.hosthash();

			/* Learn the same target from three different documents of the same source host */
			Set<DigestURL> targets = new HashSet<>();
			targets.add(target);
			LearnObject lro = new LearnObject(indexSource, targets);
			graph.learnrefs(lro);

			final DigestURL pathSource = new DigestURL("http://source.net/path/doc.html");
			targets = new HashSet<>();
			targets.add(target);
			lro = new LearnObject(pathSource, targets);
			graph.learnrefs(lro);

			final DigestURL querySource = new DigestURL("http://source.net/query?param=value");
			targets = new HashSet<>();
			targets.add(target);
			lro = new LearnObject(querySource, targets);
			graph.learnrefs(lro);

			/* Check that reference to the exact target URL is retrieved from structure */
			StructureEntry inRefs = graph.incomingReferences(targetHash);
			Assert.assertNotNull(inRefs);
			Assert.assertEquals("target.com", inRefs.hostname);
			Assert.assertNotNull(inRefs.references);
			/* One accumulated host source reference */
			Assert.assertEquals(1, inRefs.references.size());
			/* 3 accumulated links from that host */
			Assert.assertEquals(Integer.valueOf(3), inRefs.references.get(sourceHash));
		} finally {
			graph.close();
		}
	}

	/**
	 * Simple performance measurements with a test structure filled to its limits.
	 * Not a JUnit test : run manually.
	 *
	 * @param args ignored
	 * @throws MalformedURLException when a generated sample URL is invalid (not expected)
	 */
	public static void main(final String[] args) throws MalformedURLException {
		WebStructureGraph graph = new WebStructureGraph(null);
		try {
			long beginTime = System.nanoTime();
			/* The maxref sample targets do not depend on the source : build the set
			 * once instead of once per source host (learnrefs only reads the set) */
			final Set<DigestURL> targets = new HashSet<>();
			for (int j = 0; j < WebStructureGraph.maxref; j++) {
				targets.add(new DigestURL("http://target" + j + ".com/index.html"));
			}
			/* Generate maxhosts structure entries */
			for (int i = 0; i < WebStructureGraph.maxhosts; i++) {
				final DigestURL source = new DigestURL("http://source" + i + ".net/index.html");
				LearnObject lro = new LearnObject(source, targets);
				graph.learnrefs(lro);
			}
			long endTime = System.nanoTime();
			System.out.println("testPerfs test structure initialisation time : " + ((endTime - beginTime) / 1000000000) + " seconds");

			beginTime = System.nanoTime();
			/* Loop and look for incoming references on each sample generated target */
			for (int j = 0; j < WebStructureGraph.maxref; j++) {
				String targetHash = new DigestURL("http://target" + j + ".com/index.html").hosthash();
				graph.incomingReferences(targetHash);
			}
			endTime = System.nanoTime();
			System.out.println("testPerfs incomingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds");

			beginTime = System.nanoTime();
			/* Loop and look for outgoing references on each sample generated source */
			for (int i = 0; i < WebStructureGraph.maxhosts; i++) {
				String sourceHash = new DigestURL("http://source" + i + ".net/index.html").hosthash();
				graph.outgoingReferences(sourceHash);
			}
			endTime = System.nanoTime();
			System.out.println("testPerfs outgoingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
		} finally {
			graph.close();
			/* Shut down logging threads so the JVM can exit cleanly */
			ConcurrentLog.shutdown();
		}
	}
}
Loading…
Cancel
Save