From 4c9be29a55b51d9937137806ed4f248875c32a2b Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Fri, 6 Jan 2017 03:01:52 +0100
Subject: [PATCH] fix concurrency issue with htmlParser using stale scraper
 data, resulting in incorrect data for some html index metadata.
 For details see http://mantis.tokeek.de/view.php?id=717

---
 source/net/yacy/document/AbstractParser.java  |  1 -
 source/net/yacy/document/Document.java        | 33 ++++++++++++++-----
 source/net/yacy/document/content/DCEntry.java |  6 ++--
 .../yacy/document/parser/genericParser.java   | 11 -------
 .../net/yacy/document/parser/htmlParser.java  |  6 ++--
 .../schema/CollectionConfiguration.java       |  4 +--
 6 files changed, 34 insertions(+), 27 deletions(-)
diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java
index daf4a91a2..b59fa1051 100644
--- a/source/net/yacy/document/AbstractParser.java
+++ b/source/net/yacy/document/AbstractParser.java
@@ -39,7 +39,6 @@ public abstract class AbstractParser implements Parser {
     protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>(); 
     protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
     private   final String name;
-    protected Object scraperObject; // used scraper or source object if any, otherwise null
     
     /**
      * initialize a parser with a name
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 0e8fcba25..209d30f96 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -93,7 +93,9 @@ public class Document {
     private final Set<String> languages;
     private boolean indexingDenied;
     private final double lon, lat;
-    private final Parser parserObject; // the source object that was used to create the Document
+    private final Parser parserObject; // the parser object that was used to create the Document
+    // TODO: to allow use of the scraper during indexing (for some parsers) it has to be remembered here, but it holds redundant information.
+    private  Object scraperObject; // remember the source object that was used to create the Document (used during indexing)
     private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
     private final Date lastModified; // creation or last modification date of the source document
     private int crawldepth;
@@ -150,8 +152,9 @@ public class Document {
         this.generic_facets = new HashMap<String, Set<String>>();
         this.lastModified = lastModified == null ? new Date() : lastModified;
         this.crawldepth = 999; // unknown yet
+        this.scraperObject = null; // will be set by setScraperObject()
     }
-    
+
     /**
      * Get the content domain of a document. This tries to get the content domain from the mime type
      * and if this fails it uses alternatively the content domain from the file extension.
@@ -172,19 +175,32 @@ public class Document {
     }
 
     /**
-     * Confinient call to get the source/scraper object of the underlaying parser
+     * Convenience call to get the source/scraper object of the underlying parser
      * if the parser uses a scraper, like htmlParser
      * @return scraper object typically of type ContentScraper but may also of type DCEntry
      */
     public Object getScraperObject() {
-        if (this.parserObject instanceof AbstractParser) {
-            if (((AbstractParser) this.parserObject).scraperObject != null) {
-                return ((AbstractParser) this.parserObject).scraperObject;
+        return this.scraperObject;
+    }
+
+    /**
+     * Remember the scraper object used, to be able to access the used scraper via
+     * getScraperObject(). A previously set ContentScraper is closed before it is replaced.
+     * This is used for surrogate parsers to set another source/scraper than the
+     * ContentScraper used e.g. by htmlParser.
+     * @param scraper the scraper or source object to remember
+     */
+    public void setScraperObject(Object scraper) {
+        if (this.scraperObject != null) {
+            if (this.scraperObject instanceof ContentScraper) {
+                // support garbage collection
+                ((ContentScraper) this.scraperObject).close();
             }
+            this.scraperObject = null;
         }
-        return null;
+        this.scraperObject = scraper;
     }
-    
+
     public Set<String> getContentLanguages() {
         return this.languages;
     }
@@ -978,6 +994,7 @@ dc_rights
             if (scraper instanceof ContentScraper) {
                 final ContentScraper html = (ContentScraper) scraper;
                 html.close();
+                doc.scraperObject = null;
             }
         }
 
diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java
index 8d54f93c3..07b3f4560 100644
--- a/source/net/yacy/document/content/DCEntry.java
+++ b/source/net/yacy/document/content/DCEntry.java
@@ -334,9 +334,9 @@ public class DCEntry extends MultiMapSolrParams {
         t.add(getTitle());
         
         // for processing during indexing, embed entry as source scraperObject in a standard parserobj object
-        genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing
+        genericParser parserobj = new genericParser(); // init the simplest parser; this DCEntry is attached to the Document as source/scraperObject below
 
-        return new Document(
+        Document document = new Document(
             getIdentifier(true),
             "text/html",
             StandardCharsets.UTF_8.name(),
@@ -355,6 +355,8 @@ public class DCEntry extends MultiMapSolrParams {
             null,
             false,
             getDate());
+        document.setScraperObject(this); // TODO: used during indexing to access some possible but special YaCy meta tags in surrogate source ( <md:solrfilename>value ) -> optimize/find alternative
+        return document;
     }
 
     public void writeXML(OutputStreamWriter os) throws IOException {
diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java
index 1d9f2d1c8..0d6d64d6b 100644
--- a/source/net/yacy/document/parser/genericParser.java
+++ b/source/net/yacy/document/parser/genericParser.java
@@ -44,17 +44,6 @@ public class genericParser extends AbstractParser implements Parser {
         // this parser is used if no other fits. This parser fits all
     }
 
-    /**
-     * Constructor to allow to set a scraperObject
-     * because it is desired to keep the scraper/source object protected
-     * This is used for surrogate parsers to set a other source/scraper then ContentScraper
-     * @param scraper
-     */
-    public genericParser(Object scraper) {
-        super("Generic Parser");
-        this.scraperObject = scraper;
-    }
-
     @Override
     public Document[] parse(
             final DigestURL location,
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 12a00fe30..56586e6d1 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -103,8 +103,7 @@ public class htmlParser extends AbstractParser implements Parser {
         try {
             // first get a document from the parsed html
             Charset[] detectedcharsetcontainer = new Charset[]{null};
-            scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
-            ContentScraper scraper = (ContentScraper)scraperObject; // shortcut to access ContentScraper methodes
+            ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
             // parseToScraper also detects/corrects/sets charset from html content tag
             final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
             Document documentSnapshot = null;
@@ -172,8 +171,9 @@ public class htmlParser extends AbstractParser implements Parser {
                 noDoubleImages,
                 scraper.indexingDenied(),
                 scraper.getDate());
+        ppd.setScraperObject(scraper);
         ppd.setIcons(scraper.getIcons());
-
+        
         return ppd;
     }
 
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 556275392..8317e6729 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -520,7 +520,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         DigestURL canonical = null;
         
         processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values());
-        
+
         if (scraper instanceof ContentScraper) {
             final ContentScraper html = (ContentScraper) scraper;
             List<ImageEntry> images = html.getImages();
@@ -829,7 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             DCEntry dcentry = (DCEntry) scraper;
             for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
                 String tag = entry.getKey();
-                if (!tag.startsWith("md:") || tag.length() < 4) continue;
+                if (!tag.startsWith("md:") || tag.length() < 4) continue; // md: is a YaCy internal identifier for metadata in surrogate.xml files ( md:SOLR_FIELDNAME )
                 CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
                 if (solr_field == null) continue;
                 String[] values = entry.getValue();