From b44626e55b1ef87cf199198c9bc06537c4f3902a Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 22 Jul 2014 18:24:10 +0200 Subject: [PATCH] fixed target_alt_t in webgraph --- .../net/yacy/cora/document/id/AnchorURL.java | 8 ++++ .../document/parser/html/ContentScraper.java | 5 ++- .../yacy/document/parser/html/ImageEntry.java | 11 ----- .../search/schema/WebgraphConfiguration.java | 41 +++++++++---------- 4 files changed, 31 insertions(+), 34 deletions(-) diff --git a/source/net/yacy/cora/document/id/AnchorURL.java b/source/net/yacy/cora/document/id/AnchorURL.java index ea53de6cf..d51a876e3 100644 --- a/source/net/yacy/cora/document/id/AnchorURL.java +++ b/source/net/yacy/cora/document/id/AnchorURL.java @@ -38,6 +38,14 @@ public class AnchorURL extends DigestURL { this.hrefProperty = ""; } + public AnchorURL(final AnchorURL url) { + super(url, url.hash()); + this.nameProperty = url.nameProperty; + this.textProperty = url.textProperty; + this.relProperty = url.relProperty; + this.hrefProperty = url.hrefProperty; + } + public AnchorURL(final DigestURL url) { super(url, url.hash()); this.nameProperty = ""; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index e1d447616..cb56dc2e6 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -620,8 +620,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); for (ImageEntry ie: scraper.images) { if (linkurl != null) { - ie.setLinkurl(linkurl); - ie.setAnchortext(line); + AnchorURL a = new AnchorURL(linkurl); + a.setTextProperty(line); + ie.setLinkurl(a); } // this image may have been added recently from the same location (as this is a recursive parse) // we want to keep only one of them, check if they are equal diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java index f1d6061d7..30d5a7d19 100644 --- a/source/net/yacy/document/parser/html/ImageEntry.java +++ b/source/net/yacy/document/parser/html/ImageEntry.java @@ -33,7 +33,6 @@ public class ImageEntry implements Comparable, Comparator, Comparator, Comparator, Comparator 0 ? " alt=\"" + this.alt + "\"" : "") + (this.width >= 0 ? " width=\"" + this.width + "\"" : "") + diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index ce7d4bb06..6b86fc27f 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -110,8 +110,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial List edges = new ArrayList(); for (final AnchorURL target_url: links) { SolrInputDocument edge = getEdge( - subgraph, source, responseHeader, collections, crawldepth_source, images, processTypes, - sourceName, allAttr, generalNofollow, target_order, target_url); + subgraph, source, responseHeader, collections, crawldepth_source, processTypes, + sourceName, allAttr, generalNofollow, target_order, target_url, null); + target_order++; + // add the edge to the subgraph + edges.add(edge); + } + for (final ImageEntry image_url: images) { + SolrInputDocument edge = getEdge( + subgraph, source, responseHeader, collections, crawldepth_source, processTypes, + sourceName, allAttr, generalNofollow, target_order, image_url.url(), image_url.alt()); target_order++; // add the edge to the subgraph edges.add(edge); @@ -120,10 +128,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } public SolrInputDocument getEdge( - final Subgraph subgraph, - final DigestURL source_url, final ResponseHeader responseHeader, Map collections, int crawldepth_source, - final List images, final Set processTypes, - final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) { + final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map collections, + int crawldepth_source, final Set processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, + AnchorURL target_url, final String targetImageAlt /*only filled if target is an image, null otherwise*/) { final String name = target_url.getNameProperty(); // the name attribute final String text = target_url.getTextProperty(); // the text between the tag @@ -204,29 +211,21 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source); } - // parse text to find images and clear text - ContentScraper textContent = null; - try {textContent = htmlParser.parseToScraper(source_url, responseHeader.getCharacterEncoding(), text, 10);} catch (IOException e) {} - String extractedText = textContent.getText(); - // add the source attributes about the target boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url); if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : "")); - if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : ""); - if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length()); - if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0); + if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, target_url.getTextProperty()); + if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length()); + if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0); - StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30); - if (textContent != null) for (ImageEntry ie: textContent.getImages()) { - if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' '); + if (targetImageAlt != null) { + if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, targetImageAlt); + if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, targetImageAlt.length()); + if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, targetImageAlt.length() > 0 ? CommonPattern.SPACE.split(targetImageAlt).length : 0); } - while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1); - if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString()); - if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); - if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0); // add the target attributes add(edge, WebgraphSchema.target_id_s, target_id);