fix concurrency issue where htmlParser used stale (not current) scraper data

resulting in incorrect data for some html index metadata.
Details see http://mantis.tokeek.de/view.php?id=717
pull/105/head
reger 8 years ago
parent b154d3eb87
commit 4c9be29a55

@ -39,7 +39,6 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
protected Object scraperObject; // used scraper or source object if any, otherwise null
/**
* initialize a parser with a name

@ -93,7 +93,9 @@ public class Document {
private final Set<String> languages;
private boolean indexingDenied;
private final double lon, lat;
private final Parser parserObject; // the source object that was used to create the Document
private final Parser parserObject; // the parser object that was used to create the Document
// TODO: to allow to use scraper during indexing (for some parsers) it has to be remembered here, but it holds redundant information.
private Object scraperObject; // remember the source object that was used to create the Document (used during indexing)
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified; // creation or last modification date of the source document
private int crawldepth;
@ -150,8 +152,9 @@ public class Document {
this.generic_facets = new HashMap<String, Set<String>>();
this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet
this.scraperObject = null; // will be set by setScraperObject()
}
/**
* Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension.
@ -172,19 +175,32 @@ public class Document {
}
/**
* Confinient call to get the source/scraper object of the underlaying parser
* Convenient call to get the source/scraper object of the underlaying parser
* if the parser uses a scraper, like htmlParser
* @return scraper object typically of type ContentScraper but may also of type DCEntry
*/
public Object getScraperObject() {
if (this.parserObject instanceof AbstractParser) {
if (((AbstractParser) this.parserObject).scraperObject != null) {
return ((AbstractParser) this.parserObject).scraperObject;
return this.scraperObject;
}
/**
 * Remember the scraper object used, so it can later be retrieved via
 * getScraperObject().
 * This is used by surrogate parsers to set a source/scraper other than
 * ContentScraper (which is used e.g. by htmlParser).
 * Any previously remembered ContentScraper is closed first to support
 * garbage collection.
 * @param scraper the source/scraper object to remember (may be null)
 */
public void setScraperObject(Object scraper) {
    if (this.scraperObject != null) {
        if (this.scraperObject instanceof ContentScraper) {
            // support garbage collection
            ((ContentScraper) this.scraperObject).close();
        }
        // drop the old reference before installing the new one
        this.scraperObject = null;
    }
    this.scraperObject = scraper;
}
/**
 * Get the languages associated with this document.
 * @return the set of languages stored for this document
 */
public Set<String> getContentLanguages() {
return this.languages;
}
@ -978,6 +994,7 @@ dc_rights
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
html.close();
doc.scraperObject = null;
}
}

@ -334,9 +334,9 @@ public class DCEntry extends MultiMapSolrParams {
t.add(getTitle());
// for processing during indexing, embed entry as source scraperObject in a standard parserobj object
genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing
genericParser parserobj = new genericParser(); // init the simplest parser with DCEntry as source/scraperObject used during indexing
return new Document(
Document document = new Document(
getIdentifier(true),
"text/html",
StandardCharsets.UTF_8.name(),
@ -355,6 +355,8 @@ public class DCEntry extends MultiMapSolrParams {
null,
false,
getDate());
document.setScraperObject(this); // TODO: used during indexing to access some possible but special YaCy meta tags in surrogate source ( <md:solrfilename>value ) -> optimize/find alternative
return document;
}
public void writeXML(OutputStreamWriter os) throws IOException {

@ -44,17 +44,6 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
/**
 * Constructor that allows setting the scraperObject directly,
 * because the scraper/source object is kept protected (in AbstractParser).
 * This is used by surrogate parsers to set a source/scraper other than
 * ContentScraper.
 * @param scraper the source/scraper object to remember for indexing
 */
public genericParser(Object scraper) {
super("Generic Parser");
this.scraperObject = scraper;
}
@Override
public Document[] parse(
final DigestURL location,

@ -103,8 +103,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
ContentScraper scraper = (ContentScraper)scraperObject; // shortcut to access ContentScraper methodes
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -172,8 +171,9 @@ public class htmlParser extends AbstractParser implements Parser {
noDoubleImages,
scraper.indexingDenied(),
scraper.getDate());
ppd.setScraperObject(scraper);
ppd.setIcons(scraper.getIcons());
return ppd;
}

@ -520,7 +520,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
DigestURL canonical = null;
processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values());
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
List<ImageEntry> images = html.getImages();
@ -829,7 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
DCEntry dcentry = (DCEntry) scraper;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;
if (!tag.startsWith("md:") || tag.length() < 4) continue; // md: is a YaCy internal identifier for metadata in surrugate.xml files ( md:SOLR_FIELDNAME )
CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
if (solr_field == null) continue;
String[] values = entry.getValue();

Loading…
Cancel
Save