|
|
|
@ -110,8 +110,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|
|
|
|
List<SolrInputDocument> edges = new ArrayList<SolrInputDocument>();
|
|
|
|
|
for (final AnchorURL target_url: links) {
|
|
|
|
|
SolrInputDocument edge = getEdge(
|
|
|
|
|
subgraph, source, responseHeader, collections, crawldepth_source, images, processTypes,
|
|
|
|
|
sourceName, allAttr, generalNofollow, target_order, target_url);
|
|
|
|
|
subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
|
|
|
|
|
sourceName, allAttr, generalNofollow, target_order, target_url, null);
|
|
|
|
|
target_order++;
|
|
|
|
|
// add the edge to the subgraph
|
|
|
|
|
edges.add(edge);
|
|
|
|
|
}
|
|
|
|
|
for (final ImageEntry image_url: images) {
|
|
|
|
|
SolrInputDocument edge = getEdge(
|
|
|
|
|
subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
|
|
|
|
|
sourceName, allAttr, generalNofollow, target_order, image_url.url(), image_url.alt());
|
|
|
|
|
target_order++;
|
|
|
|
|
// add the edge to the subgraph
|
|
|
|
|
edges.add(edge);
|
|
|
|
@ -120,10 +128,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public SolrInputDocument getEdge(
|
|
|
|
|
final Subgraph subgraph,
|
|
|
|
|
final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
|
|
|
|
|
final List<ImageEntry> images, final Set<ProcessType> processTypes,
|
|
|
|
|
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
|
|
|
|
|
final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections,
|
|
|
|
|
int crawldepth_source, final Set<ProcessType> processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order,
|
|
|
|
|
AnchorURL target_url, final String targetImageAlt /*only filled if target is an image, null otherwise*/) {
|
|
|
|
|
|
|
|
|
|
final String name = target_url.getNameProperty(); // the name attribute
|
|
|
|
|
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
|
|
|
|
@ -204,29 +211,21 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|
|
|
|
add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// parse text to find images and clear text
|
|
|
|
|
ContentScraper textContent = null;
|
|
|
|
|
try {textContent = htmlParser.parseToScraper(source_url, responseHeader.getCharacterEncoding(), text, 10);} catch (IOException e) {}
|
|
|
|
|
String extractedText = textContent.getText();
|
|
|
|
|
|
|
|
|
|
// add the source attributes about the target
|
|
|
|
|
boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url);
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : "");
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length());
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0);
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, target_url.getTextProperty());
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length());
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0);
|
|
|
|
|
|
|
|
|
|
StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30);
|
|
|
|
|
if (textContent != null) for (ImageEntry ie: textContent.getImages()) {
|
|
|
|
|
if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' ');
|
|
|
|
|
if (targetImageAlt != null) {
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, targetImageAlt);
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, targetImageAlt.length());
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, targetImageAlt.length() > 0 ? CommonPattern.SPACE.split(targetImageAlt).length : 0);
|
|
|
|
|
}
|
|
|
|
|
while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
|
|
|
|
|
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
|
|
|
|
|
|
|
|
|
|
// add the target attributes
|
|
|
|
|
add(edge, WebgraphSchema.target_id_s, target_id);
|
|
|
|
|