diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 01fc61757..ea2e4af1a 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -81,8 +81,8 @@ public class Crawler_p { prop.putNum("urlpublictextSize", fulltext.collectionSize()); prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount()); prop.put("webgraphSolrURL", fulltext.connectedLocalSolr() ? localSolr.replace("collection1", "webgraph") : remoteSolr + "webgraph/select?&q=*:*&start=0&rows=3"); - prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0); - prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0); + prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0); + prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0); prop.putNum("citationSize", segment.citationCount()); prop.putNum("citationSegmentCount", segment.citationSegmentCount()); prop.putNum("rwipublictextSize", segment.RWICount()); diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 6141c9557..6bae5637e 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -87,7 +87,7 @@ public class IndexControlURLs_p { prop.put("cleanup", post == null ? 1 : 0); prop.put("cleanup_solr", segment.fulltext().connectedRemoteSolr() ? 1 : 0); prop.put("cleanup_rwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0); - prop.put("cleanup_citation", segment.urlCitation() != null && !segment.urlCitation().isEmpty() ? 1 : 0); + prop.put("cleanup_citation", segment.connectedCitation() && !segment.urlCitation().isEmpty() ? 1 : 0); // show export messages final Fulltext.Export export = segment.fulltext().export(); @@ -159,7 +159,7 @@ public class IndexControlURLs_p { if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {} } if ( post.get("deleteCitation", "").equals("on")) { - if (segment.urlCitation() != null) try {segment.urlCitation().clear();} catch (final IOException e) {} + if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {} } if ( post.get("deleteCrawlQueues", "").equals("on") ) { sb.crawlQueues.clear(); diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index a478183ec..2dec9cf7e 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -70,7 +70,7 @@ public class IndexFederated_p { sb.index.connectCitation(wordCacheMaxCount, fileSizeMax); } catch (final IOException e) { ConcurrentLog.logException(e); } // switch on boolean webgraph = post.getBoolean(SwitchboardConstants.CORE_SERVICE_WEBGRAPH); - sb.index.fulltext().writeWebgraph(webgraph); + sb.index.fulltext().setUseWebgraph(webgraph); env.setConfig(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, webgraph); } diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index d9f52a3b2..189159ffa 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -81,8 +81,8 @@ public class status_p { // index size prop.putNum("urlpublictextSize", fulltext.collectionSize()); prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount()); - prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0); - prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0); + prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0); + prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0); prop.putNum("citationSize", segment.citationCount()); prop.putNum("citationSegmentCount", segment.citationSegmentCount()); prop.putNum("rwipublictextSize", segment.RWICount()); @@ -131,8 +131,8 @@ public class status_p { prop.put("postprocessingRunning", Switchboard.postprocessingRunning ? 1 : 0); - boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().writeToWebgraph()); - boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().writeToWebgraph(); + boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().useWebgraph()); + boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().useWebgraph(); long collectionTimeSinceStart = processCollection && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[0] : 0; long webgraphTimeSinceStart = processWebgraph && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[1] : 0; diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index e394fe1a1..9ef5d4373 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -126,7 +126,7 @@ public class yacydoc { prop.putXML("yacy_referrer_url", (le == null) ? "" : le.url().toNormalform(true)); prop.put("yacy_size", entry.size()); prop.put("yacy_words", entry.wordCount()); - prop.put("yacy_citations", sb.index.urlCitation()!= null ? sb.index.urlCitation().count(entry.hash()) : 0); + prop.put("yacy_citations", sb.index.connectedCitation() ? sb.index.urlCitation().count(entry.hash()) : 0); prop.put("yacy_inbound", entry.llocal()); prop.put("yacy_outbound", entry.lother()); diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index 79b49166a..14ced0fed 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -181,12 +181,12 @@ public class OpenSearchConnector { if (sb == null) { return false; } - final SolrConnector connector = sb.index.fulltext().writeToWebgraph() ? null : sb.index.fulltext().getWebgraphConnector(); // check if needed Solr fields are available (selected) - if (connector == null) { + if (!sb.index.fulltext().useWebgraph()) { ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index"); return false; } + final SolrConnector connector = sb.index.fulltext().getWebgraphConnector(); final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) && ( sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) ) && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 7f5fedc30..9d473d22c 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -57,7 +57,6 @@ import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.document.TextParser; import net.yacy.kelondro.data.citation.CitationReference; -import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.SeedDB; import net.yacy.repository.Blacklist.BlacklistType; @@ -138,11 +137,14 @@ public final class CrawlStacker { // record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument byte[] anchorhash = entry.url().hash(); - IndexCell urlCitationIndex = this.indexSegment.urlCitation(); - if (urlCitationIndex != null && entry.referrerhash() != null) try { - urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime())); - } catch (final Exception e) { - ConcurrentLog.logException(e); + if (entry.referrerhash() != null) { + if (this.indexSegment.connectedCitation()) try { + this.indexSegment.urlCitation().add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime())); + } catch (final Exception e) { + ConcurrentLog.logException(e); + } + + // TODO: write to webgraph?? } try { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index a4590045d..094e06844 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -505,7 +505,7 @@ public final class Switchboard extends serverSwitch { this.index.connectUrlDb(this.useTailCache, this.exceed134217727); try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);} } - this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); + this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); // set up the solr interface final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr"); @@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch { this.index.fulltext().connectLocalSolr(); this.index.connectUrlDb(this.useTailCache, this.exceed134217727); } - this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); + this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); // set up the solr interface final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr"); @@ -2327,11 +2327,11 @@ public final class Switchboard extends serverSwitch { Set deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ? this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet(); int cleanupByHarvestkey = deletionCandidates.size(); - boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.writeToWebgraph()); - boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.writeToWebgraph(); + boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.useWebgraph()); + boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.useWebgraph(); if ((processCollection || processWebgraph) && (cleanupByHarvestkey > 0 || allCrawlsFinished)) { //full optimization of webgraph, if exists - if (fulltext.writeToWebgraph()) fulltext.getWebgraphConnector().optimize(1); + if (fulltext.useWebgraph()) fulltext.getWebgraphConnector().optimize(1); if (cleanupByHarvestkey > 0) { // run postprocessing on these profiles postprocessingRunning = true; diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index d9f2dc3fc..f30cfc26b 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -79,7 +79,7 @@ public class DocumentIndex extends Segment { false // exceed134217727 ); super.fulltext().connectLocalSolr(); - super.fulltext().writeWebgraph(true); + super.fulltext().setUseWebgraph(true); this.callback = callback; this.queue = new LinkedBlockingQueue(WorkflowProcessor.availableCPU * 300); this.worker = new Worker[WorkflowProcessor.availableCPU]; diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index a9f9fcd74..512772a52 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -110,11 +110,11 @@ public final class Fulltext { this.writeWebgraph = false; } - public void writeWebgraph(boolean check) { + public void setUseWebgraph(boolean check) { this.writeWebgraph = check; } - public boolean writeToWebgraph() { + public boolean useWebgraph() { return this.writeWebgraph; } @@ -403,7 +403,7 @@ public final class Fulltext { } public void putEdges(final Collection edges) throws IOException { - if (!this.writeToWebgraph()) return; + if (!this.useWebgraph()) return; if (edges == null || edges.size() == 0) return; try { this.getWebgraphConnector().add(edges); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 220597fb4..c90151822 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -382,9 +382,9 @@ public class Segment { } } catch (SpaceExceededException e) { // the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now - if (Segment.this.fulltext.writeToWebgraph()) internalIDs.clear(); + if (Segment.this.fulltext.useWebgraph()) internalIDs.clear(); } - if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.writeToWebgraph()) { + if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) { // reqd the references from the webgraph SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); BlockingQueue docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), 0, 10000000, 1000, 100, WebgraphSchema.source_id_s.getSolrFieldName()); @@ -663,9 +663,8 @@ public class Segment { final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName); // ENRICH DOCUMENT WITH RANKING INFORMATION - if (this.connectedCitation()) { - this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); - } + this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); + // STORE TO SOLR String error = null; this.putDocumentInQueue(vector); @@ -673,7 +672,7 @@ public class Segment { if (webgraph != null && webgraph.size() > 0) { // write the edges to the webgraph solr index - if (this.fulltext.writeToWebgraph()) { + if (this.fulltext.useWebgraph()) { tryloop: for (int i = 0; i < 20; i++) { try { error = null; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 136a9805d..d9d052f38 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -328,6 +328,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } + /** + * a SolrVector is a SolrInputDocument with the ability + * to store also the webgraph that is associated with + * the web document in the Solr document. + */ public static class SolrVector extends SolrInputDocument { private static final long serialVersionUID = -210901881471714939L; private List webgraphDocuments; @@ -891,9 +896,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri */ public int postprocessing(final Segment segment, ReferenceReportCache rrCache, ClickdepthCache clickdepthCache, String harvestkey) { if (!this.contains(CollectionSchema.process_sxt)) return 0; - if (!segment.connectedCitation() && !segment.fulltext().writeToWebgraph()) return 0; + if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0; SolrConnector collectionConnector = segment.fulltext().getDefaultConnector(); - SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector(); + SolrConnector webgraphConnector = segment.fulltext().useWebgraph() ? segment.fulltext().getWebgraphConnector() : null; collectionConnector.commit(false); // make sure that we have latest information that can be found if (webgraphConnector != null) webgraphConnector.commit(false); Map ranking = new TreeMap(Base64Order.enhancedCoder); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 03aa46176..ed9063e18 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -120,185 +120,198 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial final List images, final boolean inbound, final Collection links, final String sourceName) { boolean allAttr = this.isEmpty(); - int target_order = 0; boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0; + int target_order = 0; for (final AnchorURL target_url: links) { + SolrInputDocument edge = getEdge( + subgraph, source, responseHeader, collections, clickdepth_source, images, inbound, + sourceName, allAttr, generalNofollow, target_order, target_url); + target_order++; + // add the edge to the subgraph + subgraph.edges.add(edge); + } + } + + public SolrInputDocument getEdge( + final Subgraph subgraph, + final DigestURL source, final ResponseHeader responseHeader, Map collections, int clickdepth_source, + final List images, final boolean inbound, + final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) { - Set processTypes = new LinkedHashSet(); - - final String name = target_url.getNameProperty(); // the name attribute - final String text = target_url.getTextProperty(); // the text between the tag - String rel = target_url.getRelProperty(); // the rel-attribute - int ioidx = inbound ? 0 : 1; - if (generalNofollow) { - // patch the rel attribute since the header makes nofollow valid for all links - if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; - } - - // index organization - StringBuilder idi = new StringBuilder(8); - idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase()); - while (idi.length() < 8) idi.insert(0, '0'); - String source_id = ASCII.String(source.hash()); - String target_id = ASCII.String(target_url.hash()); - StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi); - SolrInputDocument edge = new SolrInputDocument(); - add(edge, WebgraphSchema.id, id.toString()); - add(edge, WebgraphSchema.target_order_i, target_order++); - if (allAttr || contains(WebgraphSchema.load_date_dt)) { - Date loadDate = new Date(); - Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified(); - if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; - add(edge, WebgraphSchema.load_date_dt, loadDate); - } - if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified()); - final String source_url_string = source.toNormalform(false); - if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) { - List cs = new ArrayList(); - for (Map.Entry e: collections.entrySet()) { - if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey()); - } - add(edge, WebgraphSchema.collection_sxt, cs); + Set processTypes = new LinkedHashSet(); + final String name = target_url.getNameProperty(); // the name attribute + final String text = target_url.getTextProperty(); // the text between the tag + String rel = target_url.getRelProperty(); // the rel-attribute + int ioidx = inbound ? 0 : 1; + if (generalNofollow) { + // patch the rel attribute since the header makes nofollow valid for all links + if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; + } + + // index organization + StringBuilder idi = new StringBuilder(8); + idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase()); + while (idi.length() < 8) idi.insert(0, '0'); + String source_id = ASCII.String(source.hash()); + String target_id = ASCII.String(target_url.hash()); + StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi); + SolrInputDocument edge = new SolrInputDocument(); + add(edge, WebgraphSchema.id, id.toString()); + add(edge, WebgraphSchema.target_order_i, target_order); + if (allAttr || contains(WebgraphSchema.load_date_dt)) { + Date loadDate = new Date(); + Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; + add(edge, WebgraphSchema.load_date_dt, loadDate); + } + if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified()); + final String source_url_string = source.toNormalform(false); + if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) { + List cs = new ArrayList(); + for (Map.Entry e: collections.entrySet()) { + if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey()); } + add(edge, WebgraphSchema.collection_sxt, cs); + } - // add the source attributes - add(edge, WebgraphSchema.source_id_s, source_id); - int pr_source = source_url_string.indexOf("://",0); - if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source)); - if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3)); - Map source_searchpart = source.getSearchpartMap(); - if (source_searchpart == null) { - if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0); - } else { - if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size()); - if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()])); - if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()])); - } - if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length()); - String source_host = null; - if ((source_host = source.getHost()) != null) { - String dnc = Domains.getDNC(source_host); - String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1); - int pp = subdomOrga.lastIndexOf('.'); - String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); - String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1); - if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host); - if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash()); - if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc); - if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga); - if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc); - if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom); - } - if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) { - String source_file_name = source.getFileName(); - String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name); - add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name); - add(edge, WebgraphSchema.source_file_ext_s, source_file_ext); - } - if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath()); - if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) { - String[] paths = source.getPaths(); - add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); - add(edge, WebgraphSchema.source_path_folders_sxt, paths); - } - if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { - add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); - if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH); - } - - // add the source attributes about the target - if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); - if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); - if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); - if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : "")); - if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : ""); - if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length()); - if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); - - ImageEntry ientry = null; - for (ImageEntry ie: images) { - if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;} - } - String alttext = ientry == null ? "" : ientry.alt(); - if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext); - if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); - if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0); - - // add the target attributes - add(edge, WebgraphSchema.target_id_s, target_id); - final String target_url_string = target_url.toNormalform(false); - int pr_target = target_url_string.indexOf("://",0); - subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); - subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); - subgraph.urlAnchorTexts[ioidx].add(text); - if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); - if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); - Map target_searchpart = target_url.getSearchpartMap(); - if (target_searchpart == null) { - if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0); - } else { - if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size()); - if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()])); - if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()])); - } - if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length()); - String target_host = null; - if ((target_host = target_url.getHost()) != null) { - String dnc = Domains.getDNC(target_host); - String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1); - int pp = subdomOrga.lastIndexOf('.'); - String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); - String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1); - if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host); - if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash()); - if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc); - if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga); - if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc); - if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom); - } - if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) { - String target_file_name = target_url.getFileName(); - String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name); - add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name); - add(edge, WebgraphSchema.target_file_ext_s, target_file_ext); - } - if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath()); - if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) { - String[] paths = target_url.getPaths(); - add(edge, WebgraphSchema.target_path_folders_count_i, paths.length); - add(edge, WebgraphSchema.target_path_folders_sxt, paths); - } + // add the source attributes + add(edge, WebgraphSchema.source_id_s, source_id); + int pr_source = source_url_string.indexOf("://",0); + if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source)); + if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3)); + Map source_searchpart = source.getSearchpartMap(); + if (source_searchpart == null) { + if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0); + } else { + if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size()); + if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()])); + if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()])); + } + if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length()); + String source_host = null; + if ((source_host = source.getHost()) != null) { + String dnc = Domains.getDNC(source_host); + String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1); + int pp = subdomOrga.lastIndexOf('.'); + String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); + String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1); + if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host); + if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash()); + if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc); + if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga); + if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc); + if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom); + } + if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) { + String source_file_name = source.getFileName(); + String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name); + add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name); + add(edge, WebgraphSchema.source_file_ext_s, source_file_ext); + } + if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath()); + if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) { + String[] paths = source.getPaths(); + add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); + add(edge, WebgraphSchema.source_path_folders_sxt, paths); + } + if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { + add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); + if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH); + } + + // add the source attributes about the target + if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); + if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); + if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); + if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : "")); + if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : ""); + if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length()); + if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); + + ImageEntry ientry = null; + for (ImageEntry ie: images) { + if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;} + } + String alttext = ientry == null ? "" : ientry.alt(); + if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext); + if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); + if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0); + + // add the target attributes + add(edge, WebgraphSchema.target_id_s, target_id); + final String target_url_string = target_url.toNormalform(false); + int pr_target = target_url_string.indexOf("://",0); + subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); + subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); + subgraph.urlAnchorTexts[ioidx].add(text); + if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); + if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); + Map target_searchpart = target_url.getSearchpartMap(); + if (target_searchpart == null) { + if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0); + } else { + if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size()); + if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()])); + if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()])); + } + if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length()); + String target_host = null; + if ((target_host = target_url.getHost()) != null) { + String dnc = Domains.getDNC(target_host); + String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1); + int pp = subdomOrga.lastIndexOf('.'); + String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); + String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1); + if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host); + if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash()); + if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc); + if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga); + if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc); + if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom); + } + if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) { + String target_file_name = target_url.getFileName(); + String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name); + add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name); + add(edge, WebgraphSchema.target_file_ext_s, target_file_ext); + } + if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath()); + if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) { + String[] paths = target_url.getPaths(); + add(edge, WebgraphSchema.target_path_folders_count_i, paths.length); + add(edge, WebgraphSchema.target_path_folders_sxt, paths); + } - if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { - if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) { - if (target_url.probablyRootURL()) { - boolean lc = this.lazy; this.lazy = false; - add(edge, WebgraphSchema.target_clickdepth_i, 0); - this.lazy = lc; - } else { - add(edge, WebgraphSchema.target_clickdepth_i, 999); - processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut - } + if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { + if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) { + if (target_url.probablyRootURL()) { + boolean lc = this.lazy; this.lazy = false; + add(edge, WebgraphSchema.target_clickdepth_i, 0); + this.lazy = lc; + } else { + add(edge, WebgraphSchema.target_clickdepth_i, 999); + processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut } } - - if (allAttr || contains(WebgraphSchema.process_sxt)) { - List pr = new ArrayList(); - for (ProcessType t: processTypes) pr.add(t.name()); - add(edge, WebgraphSchema.process_sxt, pr); - if (allAttr || contains(CollectionSchema.harvestkey_s)) { - add(edge, CollectionSchema.harvestkey_s, sourceName); - } + } + + if (allAttr || contains(WebgraphSchema.process_sxt)) { + List pr = new ArrayList(); + for (ProcessType t: processTypes) pr.add(t.name()); + add(edge, WebgraphSchema.process_sxt, pr); + if (allAttr || contains(CollectionSchema.harvestkey_s)) { + add(edge, CollectionSchema.harvestkey_s, sourceName); } - - // add the edge to the subgraph - subgraph.edges.add(edge); } + + // return the edge + return edge; } + public int postprocessing(final Segment segment, ClickdepthCache clickdepthCache, final String harvestkey) { if (!this.contains(WebgraphSchema.process_sxt)) return 0; - if (!segment.fulltext().writeToWebgraph()) return 0; + if (!segment.fulltext().useWebgraph()) return 0; SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector(); // that means we must search for those entries. webgraphConnector.commit(true); // make sure that we have latest information that can be found diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java index 7da5c0858..e74f6e108 100644 --- a/source/net/yacy/search/snippet/ResultEntry.java +++ b/source/net/yacy/search/snippet/ResultEntry.java @@ -174,7 +174,7 @@ public class ResultEntry implements Comparable, Comparator