fixed target_alt_t in webgraph

pull/1/head
Michael Peter Christen 11 years ago
parent 504327b15c
commit b44626e55b

@ -38,6 +38,14 @@ public class AnchorURL extends DigestURL {
this.hrefProperty = ""; this.hrefProperty = "";
} }
/**
 * Copy constructor: creates a new AnchorURL from an existing AnchorURL.
 * The given url and its hash() are handed to the superclass constructor, and the
 * name, text, rel and href properties are taken over from the given instance.
 *
 * @param url the AnchorURL to copy; it is dereferenced immediately, so it must not be null
 */
public AnchorURL(final AnchorURL url) {
super(url, url.hash());
// carry over the anchor attributes of the source instance unchanged
this.nameProperty = url.nameProperty;
this.textProperty = url.textProperty;
this.relProperty = url.relProperty;
this.hrefProperty = url.hrefProperty;
}
public AnchorURL(final DigestURL url) { public AnchorURL(final DigestURL url) {
super(url, url.hash()); super(url, url.hash());
this.nameProperty = ""; this.nameProperty = "";

@ -620,8 +620,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
for (ImageEntry ie: scraper.images) { for (ImageEntry ie: scraper.images) {
if (linkurl != null) { if (linkurl != null) {
ie.setLinkurl(linkurl); AnchorURL a = new AnchorURL(linkurl);
ie.setAnchortext(line); a.setTextProperty(line);
ie.setLinkurl(a);
} }
// this image may have been added recently from the same location (as this is a recursive parse) // this image may have been added recently from the same location (as this is a recursive parse)
// we want to keep only one of them, check if they are equal // we want to keep only one of them, check if they are equal

@ -33,7 +33,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
private final AnchorURL imageurl; private final AnchorURL imageurl;
private AnchorURL linkurl; private AnchorURL linkurl;
private final String alt; private final String alt;
private String anchortext;
private final int width, height; private final int width, height;
private final long fileSize; private final long fileSize;
@ -57,7 +56,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
assert imageurl != null; assert imageurl != null;
this.imageurl = imageurl; this.imageurl = imageurl;
this.linkurl = null; this.linkurl = null;
this.anchortext = null;
this.alt = alt; this.alt = alt;
this.width = width; this.width = width;
this.height = height; this.height = height;
@ -76,14 +74,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
return this.linkurl; return this.linkurl;
} }
/**
 * Stores the given anchor text in this image entry.
 *
 * @param anchortext the text to store; assigned to the anchortext field as-is
 */
public void setAnchortext(String anchortext) {
this.anchortext = anchortext;
}
/**
 * @return the current value of the anchortext field (the constructor fragment visible
 *         in this view initializes it to null)
 */
public String anchortext() {
return this.anchortext;
}
public String alt() { public String alt() {
return this.alt; return this.alt;
} }
@ -102,7 +92,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
@Override @Override
public String toString() { public String toString() {
if (anchortext != null) return anchortext;
return "<img url=\"" + this.imageurl.toNormalform(false) + "\"" + return "<img url=\"" + this.imageurl.toNormalform(false) + "\"" +
(this.alt != null && this.alt.length() > 0 ? " alt=\"" + this.alt + "\"" : "") + (this.alt != null && this.alt.length() > 0 ? " alt=\"" + this.alt + "\"" : "") +
(this.width >= 0 ? " width=\"" + this.width + "\"" : "") + (this.width >= 0 ? " width=\"" + this.width + "\"" : "") +

@ -110,8 +110,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
List<SolrInputDocument> edges = new ArrayList<SolrInputDocument>(); List<SolrInputDocument> edges = new ArrayList<SolrInputDocument>();
for (final AnchorURL target_url: links) { for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge( SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, images, processTypes, subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
sourceName, allAttr, generalNofollow, target_order, target_url); sourceName, allAttr, generalNofollow, target_order, target_url, null);
target_order++;
// add the edge to the subgraph
edges.add(edge);
}
for (final ImageEntry image_url: images) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
sourceName, allAttr, generalNofollow, target_order, image_url.url(), image_url.alt());
target_order++; target_order++;
// add the edge to the subgraph // add the edge to the subgraph
edges.add(edge); edges.add(edge);
@ -120,10 +128,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
} }
public SolrInputDocument getEdge( public SolrInputDocument getEdge(
final Subgraph subgraph, final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections,
final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source, int crawldepth_source, final Set<ProcessType> processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order,
final List<ImageEntry> images, final Set<ProcessType> processTypes, AnchorURL target_url, final String targetImageAlt /*only filled if target is an image, null otherwise*/) {
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
final String name = target_url.getNameProperty(); // the name attribute final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag final String text = target_url.getTextProperty(); // the text between the <a></a> tag
@ -204,29 +211,21 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source); add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source);
} }
// parse text to find images and clear text
ContentScraper textContent = null;
try {textContent = htmlParser.parseToScraper(source_url, responseHeader.getCharacterEncoding(), text, 10);} catch (IOException e) {}
String extractedText = textContent.getText();
// add the source attributes about the target // add the source attributes about the target
boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url); boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url);
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : "")); if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : ""); if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, target_url.getTextProperty());
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length()); if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0); if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0);
StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30); if (targetImageAlt != null) {
if (textContent != null) for (ImageEntry ie: textContent.getImages()) { if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, targetImageAlt);
if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' '); if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, targetImageAlt.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, targetImageAlt.length() > 0 ? CommonPattern.SPACE.split(targetImageAlt).length : 0);
} }
while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes // add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id); add(edge, WebgraphSchema.target_id_s, target_id);

Loading…
Cancel
Save