bugfix to inbound/outbound identification

pull/1/head
Michael Peter Christen 11 years ago
parent cca851a417
commit 48fbfa60c1

@ -66,7 +66,7 @@ public class webstructure {
urlhash = ASCII.getBytes(about);
hosthash = about.substring(6);
url = authenticated ? sb.getURL(urlhash) : null;
} else if (authenticated && about.length() > 0) {
} else if (about.length() > 0) {
// consider "about" as url or hostname
try {
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains

@ -859,7 +859,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// create a subgraph
if (!containsCanonical) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), sourceName);
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, document.getAnchors(), sourceName);
}
// list all links

@ -111,14 +111,14 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
public void addEdges(
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
final List<ImageEntry> images, final Collection<AnchorURL> links,
final String sourceName) {
boolean allAttr = this.isEmpty();
boolean generalNofollow = responseHeader == null ? false : responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
int target_order = 0;
for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, clickdepth_source, images, inbound,
subgraph, source, responseHeader, collections, clickdepth_source, images,
sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
@ -129,13 +129,14 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
public SolrInputDocument getEdge(
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound,
final List<ImageEntry> images,
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
String rel = target_url.getRelProperty(); // the rel-attribute
boolean inbound = target_url.getHost().equals(source.getHost()); // a link counts as 'inbound' when the target host equals the source host; other definitions of inbound/outbound exist, but this is the one used here
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links

Loading…
Cancel
Save