enhanced webgraph processing

pull/1/head
Michael Peter Christen 11 years ago
parent 9d8b32c63a
commit 0db8e34625

@ -81,8 +81,8 @@ public class Crawler_p {
prop.putNum("urlpublictextSize", fulltext.collectionSize());
prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
prop.put("webgraphSolrURL", fulltext.connectedLocalSolr() ? localSolr.replace("collection1", "webgraph") : remoteSolr + "webgraph/select?&q=*:*&start=0&rows=3");
prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("citationSize", segment.citationCount());
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
prop.putNum("rwipublictextSize", segment.RWICount());

@ -87,7 +87,7 @@ public class IndexControlURLs_p {
prop.put("cleanup", post == null ? 1 : 0);
prop.put("cleanup_solr", segment.fulltext().connectedRemoteSolr() ? 1 : 0);
prop.put("cleanup_rwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0);
prop.put("cleanup_citation", segment.urlCitation() != null && !segment.urlCitation().isEmpty() ? 1 : 0);
prop.put("cleanup_citation", segment.connectedCitation() && !segment.urlCitation().isEmpty() ? 1 : 0);
// show export messages
final Fulltext.Export export = segment.fulltext().export();
@ -159,7 +159,7 @@ public class IndexControlURLs_p {
if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {}
}
if ( post.get("deleteCitation", "").equals("on")) {
if (segment.urlCitation() != null) try {segment.urlCitation().clear();} catch (final IOException e) {}
if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {}
}
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
sb.crawlQueues.clear();

@ -70,7 +70,7 @@ public class IndexFederated_p {
sb.index.connectCitation(wordCacheMaxCount, fileSizeMax);
} catch (final IOException e) { ConcurrentLog.logException(e); } // switch on
boolean webgraph = post.getBoolean(SwitchboardConstants.CORE_SERVICE_WEBGRAPH);
sb.index.fulltext().writeWebgraph(webgraph);
sb.index.fulltext().setUseWebgraph(webgraph);
env.setConfig(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, webgraph);
}

@ -81,8 +81,8 @@ public class status_p {
// index size
prop.putNum("urlpublictextSize", fulltext.collectionSize());
prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("citationSize", segment.citationCount());
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
prop.putNum("rwipublictextSize", segment.RWICount());
@ -131,8 +131,8 @@ public class status_p {
prop.put("postprocessingRunning", Switchboard.postprocessingRunning ? 1 : 0);
boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().writeToWebgraph());
boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().writeToWebgraph();
boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().useWebgraph());
boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().useWebgraph();
long collectionTimeSinceStart = processCollection && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[0] : 0;
long webgraphTimeSinceStart = processWebgraph && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[1] : 0;

@ -126,7 +126,7 @@ public class yacydoc {
prop.putXML("yacy_referrer_url", (le == null) ? "" : le.url().toNormalform(true));
prop.put("yacy_size", entry.size());
prop.put("yacy_words", entry.wordCount());
prop.put("yacy_citations", sb.index.urlCitation()!= null ? sb.index.urlCitation().count(entry.hash()) : 0);
prop.put("yacy_citations", sb.index.connectedCitation() ? sb.index.urlCitation().count(entry.hash()) : 0);
prop.put("yacy_inbound", entry.llocal());
prop.put("yacy_outbound", entry.lother());

@ -181,12 +181,12 @@ public class OpenSearchConnector {
if (sb == null) {
return false;
}
final SolrConnector connector = sb.index.fulltext().writeToWebgraph() ? null : sb.index.fulltext().getWebgraphConnector();
// check if needed Solr fields are available (selected)
if (connector == null) {
if (!sb.index.fulltext().useWebgraph()) {
ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index");
return false;
}
final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& ( sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) )
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);

@ -57,7 +57,6 @@ import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist.BlacklistType;
@ -138,11 +137,14 @@ public final class CrawlStacker {
// record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument
byte[] anchorhash = entry.url().hash();
IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
if (urlCitationIndex != null && entry.referrerhash() != null) try {
urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
ConcurrentLog.logException(e);
if (entry.referrerhash() != null) {
if (this.indexSegment.connectedCitation()) try {
this.indexSegment.urlCitation().add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
ConcurrentLog.logException(e);
}
// TODO: write to webgraph??
}
try {

@ -505,7 +505,7 @@ public final class Switchboard extends serverSwitch {
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);}
}
this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch {
this.index.fulltext().connectLocalSolr();
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
}
this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
@ -2327,11 +2327,11 @@ public final class Switchboard extends serverSwitch {
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
int cleanupByHarvestkey = deletionCandidates.size();
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.writeToWebgraph());
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.writeToWebgraph();
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.useWebgraph());
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.useWebgraph();
if ((processCollection || processWebgraph) && (cleanupByHarvestkey > 0 || allCrawlsFinished)) {
//full optimization of webgraph, if exists
if (fulltext.writeToWebgraph()) fulltext.getWebgraphConnector().optimize(1);
if (fulltext.useWebgraph()) fulltext.getWebgraphConnector().optimize(1);
if (cleanupByHarvestkey > 0) {
// run postprocessing on these profiles
postprocessingRunning = true;

@ -79,7 +79,7 @@ public class DocumentIndex extends Segment {
false // exceed134217727
);
super.fulltext().connectLocalSolr();
super.fulltext().writeWebgraph(true);
super.fulltext().setUseWebgraph(true);
this.callback = callback;
this.queue = new LinkedBlockingQueue<AnchorURL>(WorkflowProcessor.availableCPU * 300);
this.worker = new Worker[WorkflowProcessor.availableCPU];

@ -110,11 +110,11 @@ public final class Fulltext {
this.writeWebgraph = false;
}
public void writeWebgraph(boolean check) {
public void setUseWebgraph(boolean check) {
this.writeWebgraph = check;
}
public boolean writeToWebgraph() {
public boolean useWebgraph() {
return this.writeWebgraph;
}
@ -403,7 +403,7 @@ public final class Fulltext {
}
public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
if (!this.writeToWebgraph()) return;
if (!this.useWebgraph()) return;
if (edges == null || edges.size() == 0) return;
try {
this.getWebgraphConnector().add(edges);

@ -382,9 +382,9 @@ public class Segment {
}
} catch (SpaceExceededException e) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if (Segment.this.fulltext.writeToWebgraph()) internalIDs.clear();
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear();
}
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.writeToWebgraph()) {
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
// read the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), 0, 10000000, 1000, 100, WebgraphSchema.source_id_s.getSolrFieldName());
@ -663,9 +663,8 @@ public class Segment {
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
}
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
// STORE TO SOLR
String error = null;
this.putDocumentInQueue(vector);
@ -673,7 +672,7 @@ public class Segment {
if (webgraph != null && webgraph.size() > 0) {
// write the edges to the webgraph solr index
if (this.fulltext.writeToWebgraph()) {
if (this.fulltext.useWebgraph()) {
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;

@ -328,6 +328,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
/**
* A SolrVector is a SolrInputDocument that can additionally
* hold the webgraph documents associated with the web
* document it represents.
*/
public static class SolrVector extends SolrInputDocument {
private static final long serialVersionUID = -210901881471714939L;
private List<SolrInputDocument> webgraphDocuments;
@ -891,9 +896,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
*/
public int postprocessing(final Segment segment, ReferenceReportCache rrCache, ClickdepthCache clickdepthCache, String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation() && !segment.fulltext().writeToWebgraph()) return 0;
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
SolrConnector webgraphConnector = segment.fulltext().useWebgraph() ? segment.fulltext().getWebgraphConnector() : null;
collectionConnector.commit(false); // make sure that we have latest information that can be found
if (webgraphConnector != null) webgraphConnector.commit(false);
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);

@ -120,185 +120,198 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
final String sourceName) {
boolean allAttr = this.isEmpty();
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
int target_order = 0;
for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, clickdepth_source, images, inbound,
sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
subgraph.edges.add(edge);
}
}
public SolrInputDocument getEdge(
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound,
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
}
// index organization
StringBuilder idi = new StringBuilder(8);
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
while (idi.length() < 8) idi.insert(0, '0');
String source_id = ASCII.String(source.hash());
String target_id = ASCII.String(target_url.hash());
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
SolrInputDocument edge = new SolrInputDocument();
add(edge, WebgraphSchema.id, id.toString());
add(edge, WebgraphSchema.target_order_i, target_order++);
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
add(edge, WebgraphSchema.load_date_dt, loadDate);
}
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
final String source_url_string = source.toNormalform(false);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
}
add(edge, WebgraphSchema.collection_sxt, cs);
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
}
// index organization
StringBuilder idi = new StringBuilder(8);
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
while (idi.length() < 8) idi.insert(0, '0');
String source_id = ASCII.String(source.hash());
String target_id = ASCII.String(target_url.hash());
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
SolrInputDocument edge = new SolrInputDocument();
add(edge, WebgraphSchema.id, id.toString());
add(edge, WebgraphSchema.target_order_i, target_order);
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
add(edge, WebgraphSchema.load_date_dt, loadDate);
}
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
final String source_url_string = source.toNormalform(false);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
}
add(edge, WebgraphSchema.collection_sxt, cs);
}
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
Map<String, String> source_searchpart = source.getSearchpartMap();
if (source_searchpart == null) {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
String source_host = null;
if ((source_host = source.getHost()) != null) {
String dnc = Domains.getDNC(source_host);
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
String source_file_name = source.getFileName();
String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name);
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
}
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
String[] paths = source.getPaths();
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
}
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
ImageEntry ientry = null;
for (ImageEntry ie: images) {
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
}
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
if (target_searchpart == null) {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
String target_host = null;
if ((target_host = target_url.getHost()) != null) {
String dnc = Domains.getDNC(target_host);
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
String target_file_name = target_url.getFileName();
String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name);
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
}
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
String[] paths = target_url.getPaths();
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
Map<String, String> source_searchpart = source.getSearchpartMap();
if (source_searchpart == null) {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
String source_host = null;
if ((source_host = source.getHost()) != null) {
String dnc = Domains.getDNC(source_host);
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
String source_file_name = source.getFileName();
String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name);
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
}
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
String[] paths = source.getPaths();
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
}
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
ImageEntry ientry = null;
for (ImageEntry ie: images) {
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
}
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
if (target_searchpart == null) {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
String target_host = null;
if ((target_host = target_url.getHost()) != null) {
String dnc = Domains.getDNC(target_host);
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
String target_file_name = target_url.getFileName();
String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name);
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
}
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
String[] paths = target_url.getPaths();
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);
this.lazy = lc;
} else {
add(edge, WebgraphSchema.target_clickdepth_i, 999);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);
this.lazy = lc;
} else {
add(edge, WebgraphSchema.target_clickdepth_i, 999);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
}
if (allAttr || contains(WebgraphSchema.process_sxt)) {
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(edge, CollectionSchema.harvestkey_s, sourceName);
}
}
if (allAttr || contains(WebgraphSchema.process_sxt)) {
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(edge, CollectionSchema.harvestkey_s, sourceName);
}
// add the edge to the subgraph
subgraph.edges.add(edge);
}
// return the edge
return edge;
}
public int postprocessing(final Segment segment, ClickdepthCache clickdepthCache, final String harvestkey) {
if (!this.contains(WebgraphSchema.process_sxt)) return 0;
if (!segment.fulltext().writeToWebgraph()) return 0;
if (!segment.fulltext().useWebgraph()) return 0;
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
// that means we must search for those entries.
webgraphConnector.commit(true); // make sure that we have latest information that can be found

@ -174,7 +174,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
}
/**
 * Returns the number of citation references counted for this result's URL.
 *
 * @return the count the segment's citation index reports for this entry's URL hash,
 *         or 0 when {@code connectedCitation()} is false
 */
public int referencesCount() {
    // The citation index is a configuration option and may be absent, so it is only
    // dereferenced after the segment confirms it is connected. The former
    // "urlCitation() != null" return was dropped: two consecutive returns leave the
    // second one unreachable, which is a compile error.
    return this.indexSegment.connectedCitation() ? this.indexSegment.urlCitation().count(this.urlentry.hash()) : 0;
}
public int llocal() {
return this.urlentry.llocal();

Loading…
Cancel
Save