added a new yacy protocol servlet 'idx'. This returns an index to one of the data entities that is stored in YaCy.

This servlet currently serves only the index of the web structure hosts. It can be tested by calling
http://localhost:8090/yacy/idx.json?object=host
This yacy protocol servlet is the first one that returns JSON code and that also shows index entries in a readable format. This will make the development of API applications much easier. This is also an example implementation for possible json versions of the other existing YaCy protocol interfaces.

The main purpose of this new feature is to provide a distributed block rank collection feature. Creating a block rank is very difficult if the forward-link data is first collected and then one peer must create a backward-link index. This interface already provides a partial backward index, so the indexes collected from all peers only need to be joined, which is very easy. The result should be the computation of new block rank tables that all peers can perform.

To reduce the load caused by requests from other peers, this servlet buffers all data and refreshes it only once every 12 hours. This very slow update cycle is needed because the interface will be called round-robin from all peers once after start-up.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7724 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent d326f1486a
commit 123375bfba

@ -279,7 +279,7 @@ public class IndexControlRWIs_p {
}
// make an indexContainerCache
ReferenceContainerCache<WordReference> icc = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, index.rowdef, Segment.wordOrder);
ReferenceContainerCache<WordReference> icc = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, index.row(), Segment.wordOrder);
try {
icc.add(index);
} catch (RowSpaceExceededException e) {

@ -5,6 +5,7 @@
//
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
@ -65,7 +66,10 @@ public class WatchWebStructure_p {
}
// find start point
if ((host == null) || (host.length() == 0) || (host.equals("auto"))) {
if (host == null ||
host.length() == 0 ||
host.equals("auto") ||
sb.webStructure.referencesCount(DigestURI.hosthash6(host)) == 0) {
// find domain with most references
besthost = sb.webStructure.hostWithMaxReferences();
} else {

@ -140,7 +140,7 @@ public class WebStructurePicture_p {
if (nextlayer == maxlayer) return mynodes;
nextlayer++;
final double radius = 1.0 / (1 << nextlayer);
WebStructureGraph.structureEntry sr = structure.outgoingReferences(centerhash);
WebStructureGraph.StructureEntry sr = structure.outgoingReferences(centerhash);
final Map<String, Integer> next = (sr == null) ? new HashMap<String, Integer>() : sr.references;
Map.Entry<String, Integer> entry;
String targethash, targethost;
@ -153,7 +153,7 @@ public class WebStructurePicture_p {
while ((i.hasNext()) && (maxnodes > 0) && (System.currentTimeMillis() < timeout)) {
entry = i.next();
targethash = entry.getKey();
targethost = structure.resolveDomHash2DomString(targethash);
targethost = structure.hostHash2hostName(targethash);
if (targethost == null) continue;
thisrefs = entry.getValue().intValue();
targetrefs = structure.referencesCount(targethash); // can be cpu/time-critical

@ -40,8 +40,7 @@ public class webstructure {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
final boolean latest = ((post == null) ? false : post.containsKey("latest"));
String about = ((post == null) ? null : post.get("about", null));
String about = post == null ? null : post.get("about", null);
prop.put("out", 0);
prop.put("in", 0);
if (about != null) {
@ -55,7 +54,7 @@ public class webstructure {
}
}
if (url != null && about != null) {
WebStructureGraph.structureEntry sentry = sb.webStructure.outgoingReferences(about);
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(about);
if (sentry != null) {
reference(prop, "out", 0, sentry, sb.webStructure);
prop.put("out_domains", 1);
@ -74,10 +73,12 @@ public class webstructure {
prop.put("in", 1);
}
}
} else {
final Iterator<WebStructureGraph.structureEntry> i = sb.webStructure.structureEntryIterator(latest);
} else if (sb.adminAuthenticated(header) >= 2) {
// show a complete list of link structure informations in case that the user is authenticated
final boolean latest = ((post == null) ? false : post.containsKey("latest"));
final Iterator<WebStructureGraph.StructureEntry> i = sb.webStructure.structureEntryIterator(latest);
int c = 0;
WebStructureGraph.structureEntry sentry;
WebStructureGraph.StructureEntry sentry;
while (i.hasNext()) {
sentry = i.next();
reference(prop, "out", c, sentry, sb.webStructure);
@ -86,6 +87,10 @@ public class webstructure {
prop.put("out_domains", c);
prop.put("out", 1);
if (latest) sb.webStructure.joinOldNew();
} else {
// not-authenticated users show nothing
prop.put("out_domains", 0);
prop.put("out", 1);
}
prop.put("out_maxref", WebStructureGraph.maxref);
prop.put("maxhosts", WebStructureGraph.maxhosts);
@ -94,9 +99,9 @@ public class webstructure {
return prop;
}
public static void reference(serverObjects prop, String prefix, int c, WebStructureGraph.structureEntry sentry, WebStructureGraph ws) {
prop.put(prefix + "_domains_" + c + "_hash", sentry.domhash);
prop.put(prefix + "_domains_" + c + "_domain", sentry.domain);
public static void reference(serverObjects prop, String prefix, int c, WebStructureGraph.StructureEntry sentry, WebStructureGraph ws) {
prop.put(prefix + "_domains_" + c + "_hash", sentry.hosthash);
prop.put(prefix + "_domains_" + c + "_domain", sentry.hostname);
prop.put(prefix + "_domains_" + c + "_date", sentry.date);
Iterator<Map.Entry<String, Integer>> k = sentry.references.entrySet().iterator();
Map.Entry<String, Integer> refentry;
@ -106,7 +111,7 @@ public class webstructure {
refloop: while (k.hasNext()) {
refentry = k.next();
refhash = refentry.getKey();
refdom = ws.resolveDomHash2DomString(refhash);
refdom = ws.hostHash2hostName(refhash);
if (refdom == null) continue refloop;
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refhash", refhash);
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refdom", refdom);

@ -0,0 +1,84 @@
/**
* idx
* Copyright 2011 by Michael Peter Christen
* First released 16.05.2011 at http://yacy.net
*
* $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08 Mrz 2011) $
* $LastChangedRevision: 7567 $
* $LastChangedBy: low012 $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.Iterator;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.graphics.WebStructureGraph;
import de.anomic.yacy.graphics.WebStructureGraph.HostReference;
/**
 * YaCy protocol servlet 'idx': returns an index of one of the data entities
 * stored in this peer as rewrite properties for the servlet template.
 *
 * Currently only <code>object=host</code> is handled, which delivers the
 * incoming-references index of the web structure hosts.
 */
public final class idx {

    // example:
    // http://localhost:8090/yacy/idx.json?object=host

    /**
     * Builds the rewrite properties for an index request.
     *
     * The properties "list" (number of index entries), "rowdef" and "name" are
     * always present; they keep their empty defaults when the caller is not
     * authorized or when the requested object is unknown.
     *
     * @param header the request header, used for the admin authentication check
     * @param post the request arguments; "object" selects the index to deliver
     * @param env the server environment, expected to be the Switchboard
     * @return the rewrite properties, or null if post or env is null
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        if (post == null || env == null) { return null; }

        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();
        prop.put("list", 0);
        prop.put("rowdef", "");
        prop.put("name", "");

        // serve only administrators or requests that pass the network authentication
        if (sb.adminAuthenticated(header) < 2 && !yacyNetwork.authentifyRequest(post, env)) {
            return prop;
        }

        if (post.get("object", "").equals("host")) {
            prop.put("name", "host");
            final ReferenceContainerCache<HostReference> hostIndex = sb.webStructure.incomingReferences();
            prop.put("rowdef", WebStructureGraph.hostReferenceFacory.getRow().toString());
            int count = 0;
            for (final ReferenceContainer<HostReference> references : hostIndex) {
                prop.put("list_" + count + "_term", UTF8.String(references.getTermHash()));

                // join the property forms of all references of this term with commas
                final Iterator<HostReference> referenceIterator = references.entries();
                final StringBuilder s = new StringBuilder();
                while (referenceIterator.hasNext()) {
                    s.append(referenceIterator.next().toPropertyForm());
                    if (referenceIterator.hasNext()) s.append(",");
                }
                prop.put("list_" + count + "_references", s.toString());
                prop.put("list_" + count + "_comma", 1);
                count++;
            }
            // the last entry must not be followed by a comma; with an empty index
            // there is no last entry, so do not write a "list_-1_comma" property
            if (count > 0) prop.put("list_" + (count - 1) + "_comma", 0);
            prop.put("list", count);
        }

        // return rewrite properties
        return prop;
    }
}

@ -0,0 +1,10 @@
{
"version":"#[version]#",
"uptime":"#[uptime]#",
"name":"#[name]#",
"rowdef":"#[rowdef]#",
"idx":{
#{list}#"#[term]#":[#[references]#]#(comma)#::,#(/comma)#
#{/list}#
}
}

@ -45,6 +45,7 @@ import net.yacy.cora.storage.ScoreMap;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.order.Bitfield;
@ -260,7 +261,7 @@ public final class search {
indexabstract.append("indexabstract.");
indexabstract.append(UTF8.String(wordhash));
indexabstract.append("=");
indexabstract.append(ReferenceContainer.compressIndex(container, null, 1000).toString());
indexabstract.append(WordReferenceFactory.compressIndex(container, null, 1000).toString());
indexabstract.append(serverCore.CRLF_STRING);
}
}

@ -35,11 +35,8 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import de.anomic.crawler.CrawlStacker;
@ -59,7 +56,6 @@ import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.repository.Blacklist;
@ -153,46 +149,6 @@ public final class MetadataRepository implements Iterable<byte[]> {
return null;
}
}
public void load(final WeakPriorityBlockingQueue<WordReferenceVars> obrwis, int maxcount, long maxtime, final BlockingQueue<URIMetadataRow> rows) {
if (urlIndexFile == null) return;
if (obrwis == null) return;
final Map<byte[], WeakPriorityBlockingQueue.Element<WordReferenceVars>> collector = new TreeMap<byte[], WeakPriorityBlockingQueue.Element<WordReferenceVars>>(Base64Order.enhancedCoder);
final List<byte[]> collectOrder = new ArrayList<byte[]>();
int count = 0;
long timelimit = System.currentTimeMillis() + maxtime;
WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi;
byte[] urlHash;
while (System.currentTimeMillis() < timelimit && count < maxcount) {
try {
obrwi = obrwis.take();
} catch (InterruptedException e) {
break;
}
if (obrwi != null) {
urlHash = obrwi.getElement().metadataHash();
if (urlHash != null) {
collector.put(urlHash, obrwi);
collectOrder.add(urlHash);
count++;
}
}
}
try {
Map<byte[], Row.Entry> resultmap = urlIndexFile.get(collector.keySet());
} catch (final IOException e) {
return;
} catch (InterruptedException e) {
return;
}
for (byte[] hash: collectOrder) {
WeakPriorityBlockingQueue.Element<WordReferenceVars> element = collector.get(hash);
if (element == null) continue;
}
}
public void store(final URIMetadataRow entry) throws IOException {
// Check if there is a more recent Entry already in the DB

@ -39,6 +39,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.storage.ScoreMap;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -185,7 +186,7 @@ public final class SearchEvent {
IAneardhthash = wordhash;
}
IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
IAResults.put(wordhash, ReferenceContainer.compressIndex(container, null, 1000).toString());
IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString());
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false);
} else {

@ -397,7 +397,7 @@ public final class Switchboard extends serverSwitch {
this.proxyLastAccess = System.currentTimeMillis() - 10000;
this.localSearchLastAccess = System.currentTimeMillis() - 10000;
this.remoteSearchLastAccess = System.currentTimeMillis() - 10000;
this.webStructure = new WebStructureGraph(log, new File(queuesRoot, "webStructure.map"));
this.webStructure = new WebStructureGraph(new File(queuesRoot, "webStructure.map"));
// configuring list path
if (!(listsPath.exists())) {
@ -938,7 +938,7 @@ public final class Switchboard extends serverSwitch {
10000);
// create new web structure
this.webStructure = new WebStructureGraph(log, new File(queuesRoot, "webStructure.map"));
this.webStructure = new WebStructureGraph(new File(queuesRoot, "webStructure.map"));
// load domainList

@ -29,6 +29,9 @@ package de.anomic.yacy.graphics;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -47,38 +50,46 @@ import net.yacy.cora.document.UTF8;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.MicroDate;
import net.yacy.kelondro.rwi.AbstractReference;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.LookAheadIterator;
public class WebStructureGraph {
public static int maxCRLDump = 500000;
public static int maxCRGDump = 200000;
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
public static int maxhosts = 20000; // maximum number of hosts in web structure map
private final Log log;
private final static Log log = new Log("WebStructureGraph");
private final File structureFile;
private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, String> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
private final publicRefDNSResolvingProcess publicRefDNSResolvingWorker;
private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker;
private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null);
private static class leanrefObject {
public final DigestURI url;
public final Set<MultiProtocolURI> globalRefURLs;
public leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
private final DigestURI url;
private final Set<MultiProtocolURI> globalRefURLs;
private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
this.url = url;
this.globalRefURLs = globalRefURLs;
}
}
public WebStructureGraph(final Log log, final File structureFile) {
this.log = log;
public WebStructureGraph(final File structureFile) {
this.structure_old = new TreeMap<String, String>();
this.structure_new = new TreeMap<String, String>();
this.structureFile = structureFile;
@ -110,12 +121,12 @@ public class WebStructureGraph {
delcount--;
}
}
this.publicRefDNSResolvingWorker = new publicRefDNSResolvingProcess();
this.publicRefDNSResolvingWorker = new PublicRefDNSResolvingProcess();
this.publicRefDNSResolvingWorker.start();
}
private class publicRefDNSResolvingProcess extends Thread {
public publicRefDNSResolvingProcess() {
private class PublicRefDNSResolvingProcess extends Thread {
private PublicRefDNSResolvingProcess() {
}
public void run() {
leanrefObject lro;
@ -155,7 +166,7 @@ public class WebStructureGraph {
}
}
public void learnrefs(final leanrefObject lro) {
private void learnrefs(final leanrefObject lro) {
final StringBuilder cpg = new StringBuilder(240);
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
final String refhashp = UTF8.String(lro.url.hash(), 6, 6); // ref hash part
@ -224,20 +235,20 @@ public class WebStructureGraph {
return s.toString();
}
public structureEntry outgoingReferences(final String domhash) {
// returns a map with a domhash(String):refcount(Integer) relation
assert domhash.length() == 6;
public StructureEntry outgoingReferences(final String hosthash) {
// returns a map with a hosthash(String):refcount(Integer) relation
assert hosthash.length() == 6;
SortedMap<String, String> tailMap;
Map<String, Integer> h = new HashMap<String, Integer>();
String domain = "";
String hostname = "";
String date = "";
String ref;
synchronized (structure_old) {
tailMap = structure_old.tailMap(domhash);
tailMap = structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
domain = key.substring(7);
if (key.startsWith(hosthash)) {
hostname = key.substring(7);
ref = tailMap.get(key);
date = ref.substring(0, 8);
h = refstr2map(ref);
@ -245,82 +256,220 @@ public class WebStructureGraph {
}
}
synchronized (structure_new) {
tailMap = structure_new.tailMap(domhash);
tailMap = structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
if (key.startsWith(hosthash)) {
ref = tailMap.get(key);
if (domain.length() == 0) domain = key.substring(7);
if (hostname.length() == 0) hostname = key.substring(7);
if (date.length() == 0) date = ref.substring(0, 8);
assert domain.equals(key.substring(7)) : "domain = " + domain + ", key = " + key;
h.putAll(refstr2map(ref));
}
}
}
if (h.isEmpty()) return null;
return new structureEntry(domhash, domain, date, h);
return new StructureEntry(hosthash, hostname, date, h);
}
public structureEntry incomingReferences(final String domhash) {
String host = resolveDomHash2DomString(domhash);
if (host == null) return null;
public StructureEntry incomingReferences(final String hosthash) {
String hostname = hostHash2hostName(hosthash);
if (hostname == null) return null;
// collect the references
WebStructureGraph.structureEntry sentry;
HashMap<String, Integer> domhashes = new HashMap<String, Integer>();
Iterator<WebStructureGraph.structureEntry> i = structureEntryIterator(false);
WebStructureGraph.StructureEntry sentry;
HashMap<String, Integer> hosthashes = new HashMap<String, Integer>();
Iterator<WebStructureGraph.StructureEntry> i = new StructureIterator(false);
while (i.hasNext()) {
sentry = i.next();
if (sentry.references.containsKey(domhash)) domhashes.put(sentry.domhash, sentry.references.get(domhash));
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
}
i = structureEntryIterator(true);
i = new StructureIterator(true);
while (i.hasNext()) {
sentry = i.next();
if (sentry.references.containsKey(domhash)) domhashes.put(sentry.domhash, sentry.references.get(domhash));
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
}
// construct a new structureEntry Object
return new structureEntry(
domhash,
host,
return new StructureEntry(
hosthash,
hostname,
GenericFormatter.SHORT_DAY_FORMATTER.format(),
domhashes);
hosthashes);
}
/**
 * Factory that produces {@link HostReference} objects and exposes the row
 * definition they are stored in.
 */
public static class HostReferenceFactory implements ReferenceFactory<HostReference> {

    // row definition used for every host reference entry
    private static final Row hostReferenceRow = new Row(
            "String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}",
            Base64Order.enhancedCoder);

    public HostReferenceFactory() {
        // the factory holds no per-instance state
    }

    /**
     * @return the row definition of a host reference
     */
    public Row getRow() {
        return hostReferenceRow;
    }

    /**
     * Wraps a row entry into a new HostReference.
     */
    public HostReference produceSlow(final Entry e) {
        final HostReference produced = new HostReference(e);
        return produced;
    }

    /**
     * Hands the given reference back unchanged.
     */
    public HostReference produceFast(final HostReference e) {
        return e;
    }
}
/**
 * A reference to a host, backed by a single row entry with three columns:
 * the host hash (column 0), a modification date (column 1) and a
 * reference count (column 2).
 */
public static class HostReference extends AbstractReference implements Reference {

    private final Row.Entry entry;

    /**
     * Creates a reference from its parts.
     *
     * @param hostHash the host hash; asserted to be 6 bytes long
     * @param modified the modification time, stored via MicroDate.microDateDays
     * @param count the reference count
     */
    public HostReference(final byte[] hostHash, final long modified, final int count) {
        assert (hostHash.length == 6) : "hostHash = " + UTF8.String(hostHash);
        final Row.Entry row = hostReferenceFacory.getRow().newEntry();
        row.setCol(0, hostHash);
        row.setCol(1, MicroDate.microDateDays(modified));
        row.setCol(2, count);
        this.entry = row;
    }

    /**
     * Wraps an already existing row entry.
     */
    public HostReference(final Row.Entry entry) {
        this.entry = entry;
    }

    public String toPropertyForm() {
        final String form = this.entry.toPropertyForm(':', true, true, false, true);
        return form;
    }

    public Entry toKelondroEntry() {
        return this.entry;
    }

    /**
     * @return the primary key bytes of the backing entry
     */
    public byte[] metadataHash() {
        return this.entry.getPrimaryKeyBytes();
    }

    /**
     * @return the reference count stored in column 2
     */
    public int count() {
        final long stored = this.entry.getColLong(2);
        return (int) stored;
    }

    /**
     * @return the modification time reconstructed from column 1
     */
    public long lastModified() {
        final int days = (int) this.entry.getColLong(1);
        return MicroDate.reverseMicroDateDays(days);
    }

    /**
     * Merges another host reference into this one: the later modification
     * date and the larger count are kept.
     *
     * @param r the reference to merge; must be a HostReference
     */
    public void join(final Reference r) {
        final HostReference other = (HostReference) r;

        // keep the more recent date
        final long otherModified = other.lastModified();
        if (otherModified > this.lastModified()) {
            this.entry.setCol(1, MicroDate.microDateDays(otherModified));
        }

        // keep the higher count
        final int otherCount = other.count();
        if (otherCount > this.count()) {
            this.entry.setCol(2, otherCount);
        }
    }

    /**
     * @return always a new, empty list
     */
    public Collection<Integer> positions() {
        return new ArrayList<Integer>(0);
    }
}
public HashMap<String, Integer> incomingDomains(final String domhash) {
public static final HostReferenceFactory hostReferenceFacory = new HostReferenceFactory();
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
public static long hostReferenceIndexCacheTime = 0;
public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
public synchronized ReferenceContainerCache<HostReference> incomingReferences() {
// we return a cache if the cache is filled and not stale
if (hostReferenceIndexCache != null &&
hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis()) return hostReferenceIndexCache;
// collect the references
WebStructureGraph.structureEntry sentry;
HashMap<String, Integer> domains = new HashMap<String, Integer>();
Iterator<WebStructureGraph.structureEntry> i = structureEntryIterator(false);
while (i.hasNext()) {
sentry = i.next();
if (sentry.references.containsKey(domhash)) domains.put(sentry.domain, sentry.references.get(domhash));
HostReferenceFactory hostReferenceFactory = new HostReferenceFactory();
ReferenceContainerCache<HostReference> idx = new ReferenceContainerCache<HostReference>(hostReferenceFactory, hostReferenceFactory.getRow(), Base64Order.enhancedCoder);
// we iterate over all structure entries.
// one structure entry has information that a specific host links to a list of other hosts
incomingReferencesEnrich(idx, new StructureIterator(false), 3000);
incomingReferencesEnrich(idx, new StructureIterator(true), 3000);
// fill the cache again and set fill time
hostReferenceIndexCache = idx;
hostReferenceIndexCacheTime = System.currentTimeMillis();
//incomingReferencesTest(hostReferenceIndexCache);
return hostReferenceIndexCache;
}
private void incomingReferencesEnrich(
ReferenceContainerCache<HostReference> idx,
Iterator<WebStructureGraph.StructureEntry> structureIterator,
long time) {
// we iterate over all structure entries.
// one structure entry has information that a specific host links to a list of other hosts
long timeout = System.currentTimeMillis() + time;
byte[] term;
HostReference hr;
WebStructureGraph.StructureEntry sentry;
structureLoop: while (structureIterator.hasNext()) {
sentry = structureIterator.next();
// then we loop over all the hosts that are linked from sentry.hosthash
refloop: for (Map.Entry<String, Integer> refhosthashandcounter: sentry.references.entrySet()) {
term = UTF8.getBytes(refhosthashandcounter.getKey());
try {
hr = new HostReference(UTF8.getBytes(sentry.hosthash), GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), refhosthashandcounter.getValue().intValue());
} catch (ParseException e) {
continue refloop;
}
// each term refers to an index entry. look if we already have such an entry
ReferenceContainer<HostReference> r = idx.get(term, null);
try {
if (r == null) {
r = new ReferenceContainer<HostReference>(hostReferenceFacory, term);
r.add(hr);
idx.add(r);
} else {
r.put(hr);
}
} catch (RowSpaceExceededException e) {
continue refloop;
}
}
if (System.currentTimeMillis() > timeout) break structureLoop;
}
i = structureEntryIterator(true);
while (i.hasNext()) {
sentry = i.next();
if (sentry.references.containsKey(domhash)) domains.put(sentry.domain, sentry.references.get(domhash));
}
/*
private void incomingReferencesTest(ReferenceContainerCache<HostReference> idx) {
for (ReferenceContainer<HostReference> references: idx) {
log.logInfo("Term-Host: " + hostHash2hostName(UTF8.String(references.getTermHash())));
Iterator<HostReference> referenceIterator = references.entries();
StringBuilder s = new StringBuilder();
HostReference reference;
while (referenceIterator.hasNext()) {
reference = referenceIterator.next();
s.append(reference.toPropertyForm());
log.logInfo(" ... referenced by " + hostHash2hostName(UTF8.String(reference.metadataHash())) + ", " + reference.count() + " references");
}
}
return domains;
}
*/
public int referencesCount(final String domhash) {
// returns the number of domains that are referenced by this domhash
assert domhash.length() == 6 : "domhash = " + domhash;
public int referencesCount(final String hosthash) {
// returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash;
if (hosthash == null || hosthash.length() != 6) return 0;
SortedMap<String, String> tailMap;
int c = 0;
synchronized (structure_old) {
tailMap = structure_old.tailMap(domhash);
tailMap = structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
if (key.startsWith(hosthash)) {
c = refstr2count(tailMap.get(key));
}
}
}
synchronized (structure_new) {
tailMap = structure_new.tailMap(domhash);
tailMap = structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
if (key.startsWith(hosthash)) {
c += refstr2count(tailMap.get(key));
}
}
@ -328,24 +477,24 @@ public class WebStructureGraph {
return c;
}
public String resolveDomHash2DomString(final String domhash) {
// returns the domain as string, null if unknown
assert domhash.length() == 6;
public String hostHash2hostName(final String hosthash) {
// returns the host as string, null if unknown
assert hosthash.length() == 6;
SortedMap<String, String> tailMap;
synchronized(structure_old) {
tailMap = structure_old.tailMap(domhash);
tailMap = structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
if (key.startsWith(hosthash)) {
return key.substring(7);
}
}
}
synchronized(structure_new) {
tailMap = structure_new.tailMap(domhash);
tailMap = structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
if (key.startsWith(hosthash)) {
return key.substring(7);
}
}
@ -354,10 +503,10 @@ public class WebStructureGraph {
}
private void learn(final DigestURI url, final StringBuilder reference /*string of b64(12digits)-hashes*/) {
final String domhash = UTF8.String(url.hash(), 6, 6);
final String hosthash = UTF8.String(url.hash(), 6, 6);
// parse the new reference string and join it with the stored references
structureEntry structure = outgoingReferences(domhash);
StructureEntry structure = outgoingReferences(hosthash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0 : "reference.length() = " + reference.length() + ", reference = " + reference.toString();
String dom;
@ -394,7 +543,7 @@ public class WebStructureGraph {
// store the map back to the structure
synchronized(structure_new) {
structure_new.put(domhash + "," + url.getHost(), map2refstr(refs));
structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
}
}
@ -424,7 +573,7 @@ public class WebStructureGraph {
}
}
public void saveWebStructure() {
private void saveWebStructure() {
joinOldNew();
try {
synchronized(structure_old) {
@ -436,7 +585,7 @@ public class WebStructureGraph {
}
public String hostWithMaxReferences() {
// find domain with most references
// find host with most references
String maxhost = null;
int refsize, maxref = 0;
joinOldNew();
@ -452,20 +601,19 @@ public class WebStructureGraph {
return maxhost;
}
public Iterator<structureEntry> structureEntryIterator(final boolean latest) {
// iterates objects of type structureEntry
return new structureIterator(latest);
public Iterator<StructureEntry> structureEntryIterator(final boolean latest) {
return new StructureIterator(latest);
}
public class structureIterator extends LookAheadIterator<structureEntry> implements Iterator<structureEntry> {
private class StructureIterator extends LookAheadIterator<StructureEntry> implements Iterator<StructureEntry> {
private final Iterator<Map.Entry<String, String>> i;
public structureIterator(final boolean latest) {
private StructureIterator(final boolean latest) {
i = ((latest) ? structure_new : structure_old).entrySet().iterator();
}
public structureEntry next0() {
public StructureEntry next0() {
Map.Entry<String, String> entry = null;
String dom = null, ref = "";
while (i.hasNext()) {
@ -478,20 +626,22 @@ public class WebStructureGraph {
}
if (entry == null || dom == null) return null;
assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length();
return new structureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref));
return new StructureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref));
}
}
public static class structureEntry {
public String domhash, domain, date;
public Map<String, Integer> references;
public structureEntry(
final String domhash,
final String domain,
public static class StructureEntry {
public String hosthash; // the tail of the host hash
public String hostname; // the host name
public String date; // date of latest change
public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host
private StructureEntry(
final String hosthash,
final String hostname,
final String date,
final Map<String, Integer> references) {
this.domhash = domhash;
this.domain = domain;
this.hosthash = hosthash;
this.hostname = hostname;
this.date = date;
this.references = references;
}

@ -69,6 +69,7 @@ import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -571,7 +572,7 @@ public final class yacyClient {
continue;
}
whacc += wordhash;
secondarySearchSuperviser.addAbstract(wordhash, ReferenceContainer.decompressIndex(ci, target.hash));
secondarySearchSuperviser.addAbstract(wordhash, WordReferenceFactory.decompressIndex(ci, target.hash));
ac++;
}

@ -42,7 +42,7 @@ import de.anomic.server.serverSwitch;
public class yacyNetwork {
public static final boolean authentifyRequest(final serverObjects post, final serverSwitch env) {
if ((post == null) || (env == null)) return false;
if (post == null || env == null) return false;
// identify network
final String unitName = post.get(SwitchboardConstants.NETWORK_NAME, yacySeed.DFLT_NETWORK_UNIT); // the network unit

@ -129,7 +129,7 @@ public final class CitationReferenceRow implements Reference /*, Cloneable*/ {
}
public String toPropertyForm() {
return entry.toPropertyForm(true, true, false);
return entry.toPropertyForm('=', true, true, false, false);
}
public Entry toKelondroEntry() {

@ -204,7 +204,7 @@ public final class ImageReferenceRow extends AbstractReference implements /*Imag
}
public String toPropertyForm() {
return entry.toPropertyForm(true, true, false);
return entry.toPropertyForm('=', true, true, false, false);
}
public Entry toKelondroEntry() {

@ -111,7 +111,7 @@ public final class NavigationReferenceRow extends AbstractReference implements N
}
public String toPropertyForm() {
return entry.toPropertyForm(true, true, false);
return entry.toPropertyForm('=', true, true, false, false);
}
public Entry toKelondroEntry() {

@ -26,9 +26,16 @@
package net.yacy.kelondro.data.word;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ByteBuffer;
public class WordReferenceFactory implements ReferenceFactory<WordReference> {
@ -45,4 +52,96 @@ public class WordReferenceFactory implements ReferenceFactory<WordReference> {
return WordReferenceRow.urlEntryRow;
}
/**
 * Create an index abstract for a given WordReference ReferenceContainer.
 * All references of the input container are grouped by the second six characters
 * of their metadata hash (used as the domain key); for every group the first six
 * characters of each hash are concatenated. The result has the textual form
 * <code>{dom:modmod...,dom:mod...}</code> with the domain keys in TreeMap order.
 * If the time limit is exceeded the abstract is truncated, but it is always
 * terminated with a closing brace.
 * @param <ReferenceType> not referenced by the parameter or return types; kept for source compatibility
 * @param inputContainer the container whose references shall be abstracted; it is locked while it is iterated
 * @param excludeContainer references that are also found in this container are skipped; may be null
 * @param maxtime time limit in milliseconds; a negative value means no limit
 * @return a byte buffer with the compressed representation of the host references
 */
public static final <ReferenceType extends WordReference> ByteBuffer compressIndex(final ReferenceContainer<WordReference> inputContainer, final ReferenceContainer<WordReference> excludeContainer, final long maxtime) {
    // compute the deadline; saturate instead of overflowing, otherwise a very large
    // maxtime (i.e. Long.MAX_VALUE) wraps to a negative deadline and stops after one entry
    final long start = System.currentTimeMillis();
    final long timeout = (maxtime < 0 || maxtime > Long.MAX_VALUE - start) ? Long.MAX_VALUE : start + maxtime;

    // collect references according to domains
    final TreeMap<String, StringBuilder> doms = new TreeMap<String, StringBuilder>();
    synchronized (inputContainer) {
        final Iterator<WordReference> i = inputContainer.entries();
        WordReference iEntry;
        String dom, mod;
        StringBuilder paths;
        while (i.hasNext()) {
            iEntry = i.next();
            if ((excludeContainer != null) && (excludeContainer.getReference(iEntry.metadataHash()) != null)) continue; // do not include urls that are in excludeContainer
            dom = UTF8.String(iEntry.metadataHash(), 6, 6);
            mod = UTF8.String(iEntry.metadataHash(), 0, 6);
            if ((paths = doms.get(dom)) == null) {
                doms.put(dom, new StringBuilder(30).append(mod));
            } else {
                // the builder is already stored in the map, appending in place is sufficient
                paths.append(mod);
            }
            if (System.currentTimeMillis() > timeout)
                break;
        }
    }

    // construct a result string
    final ByteBuffer bb = new ByteBuffer(inputContainer.size() * 6);
    bb.append('{');
    final Iterator<Map.Entry<String, StringBuilder>> i = doms.entrySet().iterator();
    Map.Entry<String, StringBuilder> entry;
    while (i.hasNext()) {
        entry = i.next();
        bb.append(entry.getKey());
        bb.append(':');
        bb.append(entry.getValue().toString());
        // check the deadline before writing the separator so that a truncated
        // abstract never ends with a dangling comma
        if (System.currentTimeMillis() > timeout)
            break;
        if (i.hasNext())
            bb.append(',');
    }
    bb.append('}');
    return bb;
}
/**
 * Decompress an index abstract that was generated from a word index and transmitted over a network connection.
 * The expected input has the form <code>{dom:modmod...,dom:mod...}</code> where dom and every mod
 * are six characters long. Each mod is concatenated with its dom to a twelve-character url hash,
 * and the given peer hash is appended to the peer string stored for that url hash.
 * The input comes from a remote peer, therefore all length requirements are checked explicitly
 * (assertions are disabled by default at runtime); parsing stops at the first malformed part
 * and everything that was decoded so far is returned.
 * Note that the passed buffer is consumed (trimmed) while it is parsed.
 * @param ci the compressed index abstract; null, empty or not brace-enclosed input yields an empty result
 * @param peerhash the hash of the peer that delivered the abstract
 * @return a mapping from url-hashes to a string of peer-hashes, never null
 */
public static final TreeMap<String, StringBuilder> decompressIndex(ByteBuffer ci, final String peerhash) {
    final TreeMap<String, StringBuilder> target = new TreeMap<String, StringBuilder>();
    // target is a mapping from url-hashes to a string of peer-hashes
    // at least the two braces must be present before any byte is inspected
    if (ci == null || ci.length() < 2) return target;
    if (ci.byteAt(0) != '{' || ci.byteAt(ci.length() - 1) != '}') return target;
    //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
    ci = ci.trim(1, ci.length() - 2);
    String dom, url;
    StringBuilder peers;
    StringBuilder urlsb;
    // a group needs at least 6 (dom) + 1 (':') + 6 (one mod) = 13 bytes
    while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
        dom = ci.toStringBuilder(0, 6, 6).toString();
        ci.trim(7);
        // read mods only while a complete one (6 bytes) is available
        while ((ci.length() >= 6) && (ci.byteAt(0) != ',')) {
            urlsb = ci.toStringBuilder(0, 6, 12);
            urlsb.append(dom);
            url = urlsb.toString();
            ci.trim(6);
            peers = target.get(url);
            if (peers == null) {
                peers = new StringBuilder(24);
                target.put(url, peers);
            }
            peers.append(peerhash);
            //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
        }
        // the buffer may be exhausted here, do not look at byte 0 of an empty buffer
        if ((ci.length() > 0) && (ci.byteAt(0) == ',')) ci.trim(1);
    }
    return target;
}
}

@ -229,7 +229,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
}
public String toPropertyForm() {
return entry.toPropertyForm(true, true, false);
return entry.toPropertyForm('=', true, true, false, false);
}
public Entry toKelondroEntry() {
@ -322,12 +322,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
return toPropertyForm();
}
/**
 * Compare the modification time of this reference with another one.
 * @param other the reference to compare with; may be null
 * @return true if other is not null and this reference was modified earlier than other
 */
public boolean isOlder(final Reference other) {
    return other != null && this.lastModified() < other.lastModified();
}
@Override
public boolean equals(final Object obj) {
if (this == obj) return true;

@ -196,11 +196,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
/**
 * @return the value of the flags field of this reference
 */
public Bitfield flags() {
    return this.flags;
}
/*
public long freshUntil() {
return freshUntil;
}
*/
/**
 * @return the value of the language field of this reference (the array itself, not a copy)
 */
public byte[] getLanguage() {
    return this.language;
}
@ -213,11 +209,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return hitcount;
}
/**
 * This comparison is not supported by this class and should not be called.
 * With assertions enabled a call fails with an AssertionError,
 * otherwise false is returned regardless of the argument.
 * @param other ignored
 * @return always false
 */
public boolean isOlder(final Reference other) {
assert false; // should not be used
return false;
}
/**
 * @return the value of the lastModified field of this reference
 */
public long lastModified() {
    return this.lastModified;
}

@ -79,7 +79,7 @@ public final class Row {
this.objectOrder = objectOrder;
// define row with row syntax
// example:
//# Structure=<pivot-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>
//# Structure=<pivot-12>,<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>
// parse pivot definition:
//does not work with 'String idx-26 "id = created + originator",String cat-8,String rec-14,short dis-2 {b64e},String att-462'
@ -553,22 +553,26 @@ public final class Row {
System.arraycopy(rowinstance, offset + colstart[column], target, targetOffset, row[column].cellwidth);
}
public final String toPropertyForm(final boolean includeBraces, final boolean decimalCardinal, final boolean longname) {
public final String toPropertyForm(final char propertySymbol, final boolean includeBraces, final boolean decimalCardinal, final boolean longname, final boolean quotes) {
final ByteBuffer bb = new ByteBuffer(objectsize() * 2);
if (includeBraces) bb.append('{');
for (int i = 0; i < row.length; i++) {
if (quotes) bb.append('"');
bb.append((longname) ? row[i].description : row[i].nickname);
bb.append('=');
if (quotes) bb.append('"');
bb.append(propertySymbol);
if (quotes) bb.append('"');
if ((decimalCardinal) && (row[i].celltype == Column.celltype_cardinal)) {
bb.append(Long.toString(getColLong(i)));
} else if ((decimalCardinal) && (row[i].celltype == Column.celltype_bitfield)) {
bb.append((new Bitfield(getColBytes(i, true))).exportB64());
} else if ((decimalCardinal) && (row[i].celltype == Column.celltype_binary)) {
assert row[i].cellwidth == 1;
assert row[i].cellwidth == 1 : toString();
bb.append(Integer.toString((0xff & getColByte(i))));
} else {
bb.append(rowinstance, offset + colstart[i], row[i].cellwidth);
}
if (quotes) bb.append('"');
if (i < row.length - 1) {
bb.append(',');
if (longname) bb.append(' ');
@ -581,7 +585,7 @@ public final class Row {
@Override
public final String toString() {
return toPropertyForm(true, false, false);
return toPropertyForm('=', true, false, false, false);
}
}

@ -86,7 +86,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
new ThreadPoolExecutor.CallerRunsPolicy())
: null;
public final Row rowdef;
protected final Row rowdef;
protected byte[] chunkcache;
protected int chunkcount;
protected int sortBound;

@ -108,4 +108,10 @@ public abstract class AbstractReference implements Reference {
}
return d / (positions().size() - 1);
}
/**
 * Compare the modification time of this reference with another one.
 * @param other the reference to compare with; may be null
 * @return true if other is not null and this reference was modified earlier than other
 */
public boolean isOlder(final Reference other) {
    return other != null && this.lastModified() < other.lastModified();
}
}

@ -30,7 +30,6 @@ import java.lang.reflect.Method;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
@ -41,7 +40,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.util.ByteBuffer;
/**
@ -513,82 +511,4 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
return (int) Base64Order.enhancedCoder.decodeLong(this.termHash, 0, 4);
}
/**
 * Create an index abstract for a given ReferenceContainer.
 * All references of the input container are grouped by the second six characters
 * of their metadata hash (used as the domain key); for every group the first six
 * characters of each hash are concatenated. The result has the textual form
 * <code>{dom:modmod...,dom:mod...}</code> with the domain keys in TreeMap order.
 * If the time limit is exceeded the abstract is truncated, but it is always
 * terminated with a closing brace.
 * @param <ReferenceType> the reference type stored in both containers
 * @param inputContainer the container whose references shall be abstracted; it is locked while it is iterated
 * @param excludeContainer references that are also found in this container are skipped; may be null
 * @param maxtime time limit in milliseconds; a negative value means no limit
 * @return a byte buffer with the compressed representation of the host references
 */
public static final <ReferenceType extends Reference> ByteBuffer compressIndex(final ReferenceContainer<ReferenceType> inputContainer, final ReferenceContainer<ReferenceType> excludeContainer, final long maxtime) {
    // compute the deadline; saturate instead of overflowing, otherwise a very large
    // maxtime (i.e. Long.MAX_VALUE) wraps to a negative deadline and stops after one entry
    final long start = System.currentTimeMillis();
    final long timeout = (maxtime < 0 || maxtime > Long.MAX_VALUE - start) ? Long.MAX_VALUE : start + maxtime;

    // collect references according to domains
    final TreeMap<String, StringBuilder> doms = new TreeMap<String, StringBuilder>();
    synchronized (inputContainer) {
        final Iterator<ReferenceType> i = inputContainer.entries();
        Reference iEntry;
        String dom, mod;
        StringBuilder paths;
        while (i.hasNext()) {
            iEntry = i.next();
            if ((excludeContainer != null) && (excludeContainer.getReference(iEntry.metadataHash()) != null)) continue; // do not include urls that are in excludeContainer
            dom = UTF8.String(iEntry.metadataHash(), 6, 6);
            mod = UTF8.String(iEntry.metadataHash(), 0, 6);
            if ((paths = doms.get(dom)) == null) {
                doms.put(dom, new StringBuilder(30).append(mod));
            } else {
                // the builder is already stored in the map, appending in place is sufficient
                paths.append(mod);
            }
            if (System.currentTimeMillis() > timeout)
                break;
        }
    }

    // construct a result string
    final ByteBuffer bb = new ByteBuffer(inputContainer.size() * 6);
    bb.append('{');
    final Iterator<Map.Entry<String, StringBuilder>> i = doms.entrySet().iterator();
    Map.Entry<String, StringBuilder> entry;
    while (i.hasNext()) {
        entry = i.next();
        bb.append(entry.getKey());
        bb.append(':');
        bb.append(entry.getValue().toString());
        // check the deadline before writing the separator so that a truncated
        // abstract never ends with a dangling comma
        if (System.currentTimeMillis() > timeout)
            break;
        if (i.hasNext())
            bb.append(',');
    }
    bb.append('}');
    return bb;
}
/**
 * Decompress an index abstract that was produced by compressIndex.
 * The expected input has the form <code>{dom:modmod...,dom:mod...}</code> where dom and every mod
 * are six characters long. Each mod is concatenated with its dom to a twelve-character url hash,
 * and the given peer hash is appended to the peer string stored for that url hash.
 * All length requirements are checked explicitly (assertions are disabled by default at runtime);
 * parsing stops at the first malformed part and everything that was decoded so far is returned.
 * Note that the passed buffer is consumed (trimmed) while it is parsed.
 * @param ci the compressed index abstract; null, empty or not brace-enclosed input yields an empty result
 * @param peerhash the hash of the peer that delivered the abstract
 * @return a mapping from url-hashes to a string of peer-hashes, never null
 */
public static final TreeMap<String, StringBuilder> decompressIndex(ByteBuffer ci, final String peerhash) {
    final TreeMap<String, StringBuilder> target = new TreeMap<String, StringBuilder>();
    // target is a mapping from url-hashes to a string of peer-hashes
    // at least the two braces must be present before any byte is inspected
    if (ci == null || ci.length() < 2) return target;
    if (ci.byteAt(0) != '{' || ci.byteAt(ci.length() - 1) != '}') return target;
    //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
    ci = ci.trim(1, ci.length() - 2);
    String dom, url;
    StringBuilder peers;
    StringBuilder urlsb;
    // a group needs at least 6 (dom) + 1 (':') + 6 (one mod) = 13 bytes
    while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
        dom = ci.toStringBuilder(0, 6, 6).toString();
        ci.trim(7);
        // read mods only while a complete one (6 bytes) is available
        while ((ci.length() >= 6) && (ci.byteAt(0) != ',')) {
            urlsb = ci.toStringBuilder(0, 6, 12);
            urlsb.append(dom);
            url = urlsb.toString();
            ci.trim(6);
            peers = target.get(url);
            if (peers == null) {
                peers = new StringBuilder(24);
                target.put(url, peers);
            }
            peers.append(peerhash);
            //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
        }
        // the buffer may be exhausted here, do not look at byte 0 of an empty buffer
        if ((ci.length() > 0) && (ci.byteAt(0) == ',')) ci.trim(1);
    }
    return target;
}
}

Loading…
Cancel
Save