From b44626e55b1ef87cf199198c9bc06537c4f3902a Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Tue, 22 Jul 2014 18:24:10 +0200
Subject: [PATCH] fixed target_alt_t in webgraph

---
 .../net/yacy/cora/document/id/AnchorURL.java  |  8 ++++
 .../document/parser/html/ContentScraper.java  |  5 ++-
 .../yacy/document/parser/html/ImageEntry.java | 11 -----
 .../search/schema/WebgraphConfiguration.java  | 41 +++++++++----------
 4 files changed, 31 insertions(+), 34 deletions(-)
diff --git a/source/net/yacy/cora/document/id/AnchorURL.java b/source/net/yacy/cora/document/id/AnchorURL.java
index ea53de6cf..d51a876e3 100644
--- a/source/net/yacy/cora/document/id/AnchorURL.java
+++ b/source/net/yacy/cora/document/id/AnchorURL.java
@@ -38,6 +38,14 @@ public class AnchorURL extends DigestURL {
         this.hrefProperty = "";
     }
     
+    public AnchorURL(final AnchorURL url) { // copy constructor: builds a new AnchorURL from an existing AnchorURL
+        super(url, url.hash()); // same superclass call as the DigestURL-based constructor: the source URL plus its hash
+        this.nameProperty = url.nameProperty; // anchor properties are carried over from the source object
+        this.textProperty = url.textProperty;
+        this.relProperty = url.relProperty;
+        this.hrefProperty = url.hrefProperty;
+    }
+
     public AnchorURL(final DigestURL url) {
         super(url, url.hash());
         this.nameProperty = "";
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index e1d447616..cb56dc2e6 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -620,8 +620,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
         for (ImageEntry ie: scraper.images) {
             if (linkurl != null) {
-                ie.setLinkurl(linkurl);
-                ie.setAnchortext(line);
+                AnchorURL a = new AnchorURL(linkurl);
+                a.setTextProperty(line);
+                ie.setLinkurl(a);
             }
             // this image may have been added recently from the same location (as this is a recursive parse)
             // we want to keep only one of them, check if they are equal
diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java
index f1d6061d7..30d5a7d19 100644
--- a/source/net/yacy/document/parser/html/ImageEntry.java
+++ b/source/net/yacy/document/parser/html/ImageEntry.java
@@ -33,7 +33,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
     private final AnchorURL imageurl;
     private AnchorURL linkurl;
     private final String alt;
-    private String anchortext;
     private final int width, height;
     private final long fileSize;
 
@@ -57,7 +56,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
         assert imageurl != null;
         this.imageurl = imageurl;
         this.linkurl = null;
-        this.anchortext = null;
         this.alt = alt;
         this.width = width;
         this.height = height;
@@ -76,14 +74,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
         return this.linkurl;
     }
 
-    public void setAnchortext(String anchortext) {
-        this.anchortext = anchortext;
-    }
-
-    public String anchortext() {
-        return this.anchortext;
-    }
-
     public String alt() {
         return this.alt;
     }
@@ -102,7 +92,6 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
 
     @Override
     public String toString() {
-        if (anchortext != null) return anchortext;
         return "<img url=\"" + this.imageurl.toNormalform(false) + "\"" +
                (this.alt != null && this.alt.length() > 0 ? " alt=\"" + this.alt + "\"" : "") +
                (this.width >= 0 ? " width=\"" + this.width + "\"" : "") +
diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java
index ce7d4bb06..6b86fc27f 100644
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@@ -110,8 +110,16 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
         List<SolrInputDocument> edges = new ArrayList<SolrInputDocument>();
         for (final AnchorURL target_url: links) {
             SolrInputDocument edge = getEdge(
-                    subgraph, source, responseHeader, collections, crawldepth_source, images, processTypes,
-                    sourceName, allAttr, generalNofollow, target_order, target_url);
+                    subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
+                    sourceName, allAttr, generalNofollow, target_order, target_url, null);
+            target_order++;
+            // add the edge to the subgraph
+            edges.add(edge);
+        }
+        for (final ImageEntry image_url: images) {
+            SolrInputDocument edge = getEdge(
+                    subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
+                    sourceName, allAttr, generalNofollow, target_order, image_url.url(), image_url.alt());
             target_order++;
             // add the edge to the subgraph
             edges.add(edge);
@@ -120,10 +128,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
     }
     
     public SolrInputDocument getEdge(
-            final Subgraph subgraph,
-            final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
-            final List<ImageEntry> images, final Set<ProcessType> processTypes,
-            final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
+            final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections,
+            int crawldepth_source, final Set<ProcessType> processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order,
+            AnchorURL target_url, final String targetImageAlt /*only filled if target is an image, null otherwise*/) {
 
         final String name = target_url.getNameProperty(); // the name attribute
         final String text = target_url.getTextProperty(); // the text between the <a></a> tag
@@ -204,29 +211,21 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
             add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source);
         }
 
-        // parse text to find images and clear text
-        ContentScraper textContent = null;
-        try {textContent = htmlParser.parseToScraper(source_url, responseHeader.getCharacterEncoding(), text, 10);} catch (IOException e) {}
-        String extractedText = textContent.getText();
-        
         // add the source attributes about the target
         boolean inbound = CollectionConfiguration.enrichSubgraph(subgraph, source_url, target_url);
         if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
         if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
         if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
         if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
-        if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : "");
-        if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length());
-        if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0);
+        if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, target_url.getTextProperty());
+        if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length());
+        if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0);
         
-        StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30);
-        if (textContent != null) for (ImageEntry ie: textContent.getImages()) {
-            if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' ');
+        if (targetImageAlt != null) {
+            if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, targetImageAlt);
+            if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, targetImageAlt.length());
+            if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, targetImageAlt.length() > 0 ? CommonPattern.SPACE.split(targetImageAlt).length : 0);
         }
-        while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
-        if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
-        if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
-        if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
         
         // add the target attributes
         add(edge, WebgraphSchema.target_id_s, target_id);