refactoring of clickdepth computation as preparation for clickdepth computation of webgraph links
pull/1/head
Michael Peter Christen 12 years ago
parent 2080fc7406
commit 6300730d7f

@@ -69,7 +69,6 @@ import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
@@ -99,8 +98,6 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
@@ -2296,64 +2293,8 @@ public final class Switchboard extends serverSwitch {
// if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned
if (this.crawlQueues.coreCrawlJobSize() == 0 && index.connectedCitation() && index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt)) {
// that means we must search for those entries.
index.fulltext().getDefaultConnector().commit(true); // make sure that we have latest information that can be found
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
BlockingQueue<SolrDocument> docs = index.fulltext().getDefaultConnector().concurrentQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
SolrDocument doc;
int proccount_clickdepth = 0;
int proccount_clickdepthchange = 0;
int proccount_referencechange = 0;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// for each to-be-processed entry work on the process tag
Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
for (Object tag: proctags) {
String tagname = (String) tag;
ProcessType tagtype = ProcessType.valueOf(tagname);
// switch over tag types
if (tagtype == ProcessType.CLICKDEPTH) {
//proctags.remove(tag);
if (index.fulltext().getDefaultConfiguration().contains(CollectionSchema.clickdepth_i)) {
DigestURI url;
try {
// get new click depth and compare with old
Integer oldclickdepth = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())));
int clickdepth = CollectionConfiguration.getClickDepth(index.urlCitation(), url);
if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
//log.logInfo("new clickdepth " + clickdepth + " for " + url.toNormalform(true));
proccount_clickdepthchange++;
}
SolrInputDocument sid = index.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
sid.setField(CollectionSchema.clickdepth_i.getSolrFieldName(), clickdepth);
// refresh the link count; it's 'cheap' to do this here
if (index.fulltext().getDefaultConfiguration().contains(CollectionSchema.references_i)) {
Integer oldreferences = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
int references = index.urlCitation().count(url.hash());
if (references > 0) {
if (oldreferences == null || oldreferences.intValue() != references) proccount_referencechange++;
sid.setField(CollectionSchema.references_i.getSolrFieldName(), references);
}
}
// remove the processing tag
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
// send back to index
index.fulltext().getDefaultConnector().add(sid);
proccount_clickdepth++;
} catch (Throwable e) {
Log.logException(e);
}
}
}
}
}
log.logInfo("cleanup_processing: re-calculated " + proccount_clickdepth + " new clickdepth values, " + proccount_clickdepthchange + " clickdepth values changed, " + proccount_referencechange + " reference-count values changed.");
if (this.crawlQueues.coreCrawlJobSize() == 0) {
index.fulltext().getDefaultConfiguration().postprocessing(index);
}
return true;
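
The block above drains a BlockingQueue that concurrentQuery fills in the background; a sentinel entry (POISON_DOCUMENT) marks the end of the result stream and is compared by identity (!=), not by equals(). A minimal, self-contained sketch of this poison-pill pattern; the queue, producer thread and POISON sentinel below are illustrative stand-ins, not YaCy API:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class PoisonPillDemo {
    // a unique sentinel; new String(...) guarantees an identity distinct from any
    // interned literal, so the consumer can safely compare with != instead of equals()
    private static final String POISON = new String("POISON");

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

        // producer: emits results asynchronously, then the poison pill as the last element
        new Thread(() -> {
            try {
                for (int i = 0; i < 3; i++) queue.put("doc-" + i);
                queue.put(POISON); // signal: no more results will follow
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }).start();

        // consumer: take() blocks until an element arrives; the sentinel ends the loop
        String doc;
        while ((doc = queue.take()) != POISON) {
            System.out.println("processing " + doc);
        }
    }
}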

@@ -61,15 +61,18 @@ import net.yacy.document.Parser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.LoaderDispatcher;
@@ -201,6 +204,65 @@ public class Segment {
return this.urlCitationIndex;
}
/**
* compute the click level using the citation reference database
* @param url the url to be checked
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
* @throws IOException
*/
public int getClickDepth(final DigestURI url) throws IOException {
final byte[] searchhash = url.hash();
RowHandleSet rootCandidates = url.getPossibleRootHashes();
RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
final byte[] hosthash = new byte[6]; // the host of the url to be checked
System.arraycopy(searchhash, 6, hosthash, 0, 6);
long timeout = System.currentTimeMillis() + 10000;
for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
checkloop: for (byte[] urlhash: levelhashes) {
// get all the citations for this url and iterate
ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null);
if (references == null || references.size() == 0) continue checkloop; // don't know
Iterator<CitationReference> i = references.entries();
nextloop: while (i.hasNext()) {
CitationReference ref = i.next();
if (ref == null) continue nextloop;
byte[] u = ref.urlhash();
// check ignore
if (ignore.has(u)) continue nextloop;
// check if this is from the same host
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
// check if the url is a root url
if (rootCandidates.has(u)) {
return leveldepth + 1;
}
// step to next depth level
try {checknext.put(u);} catch (SpaceExceededException e) {}
try {ignore.put(u);} catch (SpaceExceededException e) {}
}
}
leveldepth++;
levelhashes = checknext;
}
return 999;
}
public long URLCount() {
return this.fulltext.collectionSize();
}

@@ -39,6 +39,7 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -48,11 +49,12 @@ import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
@@ -62,12 +64,10 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.search.index.Segment;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@@ -761,63 +761,77 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return doc;
}
/**
* compute the click level using the citation reference database
* @param citations the citation database
* @param searchhash the hash of the url to be checked
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
* @throws IOException
* post-processing steps for all entries that have a process tag assigned
* @param segment the index segment that provides the Solr connector and the citation index
*/
public static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
final byte[] searchhash = url.hash();
RowHandleSet rootCandidates = url.getPossibleRootHashes();
RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
final byte[] hosthash = new byte[6]; // the host of the url to be checked
System.arraycopy(searchhash, 6, hosthash, 0, 6);
long timeout = System.currentTimeMillis() + 10000;
for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
checkloop: for (byte[] urlhash: levelhashes) {
// get all the citations for this url and iterate
ReferenceContainer<CitationReference> references = citations.get(urlhash, null);
if (references == null || references.size() == 0) continue checkloop; // don't know
Iterator<CitationReference> i = references.entries();
nextloop: while (i.hasNext()) {
CitationReference ref = i.next();
if (ref == null) continue nextloop;
byte[] u = ref.urlhash();
// check ignore
if (ignore.has(u)) continue nextloop;
// check if this is from the same host
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
// check if the url is a root url
if (rootCandidates.has(u)) {
return leveldepth + 1;
public void postprocessing(Segment segment) {
if (!this.contains(CollectionSchema.process_sxt)) return;
if (!segment.connectedCitation()) return;
SolrConnector connector = segment.fulltext().getDefaultConnector();
// that means we must search for those entries.
connector.commit(true); // make sure that we have latest information that can be found
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
BlockingQueue<SolrDocument> docs = connector.concurrentQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
SolrDocument doc;
int proccount_clickdepth = 0;
int proccount_clickdepthchange = 0;
int proccount_referencechange = 0;
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// for each to-be-processed entry work on the process tag
Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
for (Object tag: proctags) {
String tagname = (String) tag;
ProcessType tagtype = ProcessType.valueOf(tagname);
// switch over tag types
if (tagtype == ProcessType.CLICKDEPTH) {
//proctags.remove(tag);
if (this.contains(CollectionSchema.clickdepth_i)) {
DigestURI url;
try {
// get new click depth and compare with old
Integer oldclickdepth = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())));
int clickdepth = segment.getClickDepth(url);
if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
//log.logInfo("new clickdepth " + clickdepth + " for " + url.toNormalform(true));
proccount_clickdepthchange++;
}
SolrInputDocument sid = this.toSolrInputDocument(doc);
sid.setField(CollectionSchema.clickdepth_i.getSolrFieldName(), clickdepth);
// refresh the link count; it's 'cheap' to do this here
if (this.contains(CollectionSchema.references_i)) {
Integer oldreferences = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
int references = segment.urlCitation().count(url.hash());
if (references > 0) {
if (oldreferences == null || oldreferences.intValue() != references) proccount_referencechange++;
sid.setField(CollectionSchema.references_i.getSolrFieldName(), references);
}
}
// remove the processing tag
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
// send back to index
connector.add(sid);
proccount_clickdepth++;
} catch (Throwable e) {
Log.logException(e);
}
}
}
// step to next depth level
try {checknext.put(u);} catch (SpaceExceededException e) {}
try {ignore.put(u);} catch (SpaceExceededException e) {}
}
}
leveldepth++;
levelhashes = checknext;
Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount_clickdepth + " new clickdepth values, " + proccount_clickdepthchange + " clickdepth values changed, " + proccount_referencechange + " reference-count values changed.");
} catch (InterruptedException e) {
}
return 999;
}
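
The relocated postprocessing is driven entirely by the process_sxt field: the range query process_sxt:[* TO *] matches every document that has any value in that field, and since the tag is removed before the document is re-added, each document is handled at most once. A hypothetical sketch of how a document would be marked for this cycle; the field names are taken from the diff above, the hash and url values are made up, SolrInputDocument is the standard Apache Solr class, and the solr jar must be on the classpath:

import org.apache.solr.common.SolrInputDocument;

public class ProcessTagDemo {
    public static void main(String[] args) {
        // mark a document for later clickdepth postprocessing (illustrative values)
        SolrInputDocument sid = new SolrInputDocument();
        sid.setField("id", "aBcDeF123456");                   // the YaCy url hash, used as document id
        sid.setField("sku", "http://example.com/page.html");  // the url itself
        sid.addField("process_sxt", "CLICKDEPTH");            // process tag, i.e. ProcessType.CLICKDEPTH.name()
        // postprocessing() later selects such documents with the query process_sxt:[* TO *],
        // computes clickdepth_i and references_i, removes process_sxt and re-adds the document
        System.out.println(sid);
    }
}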
/**
