From 4c9be29a55b51d9937137806ed4f248875c32a2b Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 6 Jan 2017 03:01:52 +0100 Subject: [PATCH] fix concurrency issue with htmlParser using not current scraper data resulting in incorrect data for some html index metadata. Details see http://mantis.tokeek.de/view.php?id=717 --- source/net/yacy/document/AbstractParser.java | 1 - source/net/yacy/document/Document.java | 33 ++++++++++++++----- source/net/yacy/document/content/DCEntry.java | 6 ++-- .../yacy/document/parser/genericParser.java | 11 ------- .../net/yacy/document/parser/htmlParser.java | 6 ++-- .../schema/CollectionConfiguration.java | 4 +-- 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index daf4a91a2..b59fa1051 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -39,7 +39,6 @@ public abstract class AbstractParser implements Parser { protected final Set SUPPORTED_MIME_TYPES = new LinkedHashSet(); protected final Set SUPPORTED_EXTENSIONS = new HashSet(); private final String name; - protected Object scraperObject; // used scraper or source object if any, otherwise null /** * initialize a parser with a name diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 0e8fcba25..209d30f96 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -93,7 +93,9 @@ public class Document { private final Set languages; private boolean indexingDenied; private final double lon, lat; - private final Parser parserObject; // the source object that was used to create the Document + private final Parser parserObject; // the parser object that was used to create the Document + // TODO: to allow to use scraper during indexing (for some parsers) it has to be remembered here, but it holds redundant information. 
+ private Object scraperObject; // remember the source object that was used to create the Document (used during indexing) private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document private final Date lastModified; // creation or last modification date of the source document private int crawldepth; @@ -150,8 +152,9 @@ public class Document { this.generic_facets = new HashMap>(); this.lastModified = lastModified == null ? new Date() : lastModified; this.crawldepth = 999; // unknown yet + this.scraperObject = null; // will be set by setScraperObject() } - + /** * Get the content domain of a document. This tries to get the content domain from the mime type * and if this fails it uses alternatively the content domain from the file extension. @@ -172,19 +175,32 @@ public class Document { } /** - * Confinient call to get the source/scraper object of the underlaying parser + * Convenient call to get the source/scraper object of the underlying parser * if the parser uses a scraper, like htmlParser * @return scraper object typically of type ContentScraper but may also of type DCEntry */ public Object getScraperObject() { - if (this.parserObject instanceof AbstractParser) { - if (((AbstractParser) this.parserObject).scraperObject != null) { - return ((AbstractParser) this.parserObject).scraperObject; + return this.scraperObject; + } + + /** + * Remember the scraper object used, to be able to access used scraper by + * getScraperObject(). + * This is used for surrogate parsers to set another source/scraper than ContentScraper + * used e.g. by htmlParser. 
+ * @param scraper the source/scraper object to remember; a previously set ContentScraper is closed before it is replaced + */ + public void setScraperObject(Object scraper) { + if (this.scraperObject != null) { + if (this.scraperObject instanceof ContentScraper) { + // support garbage collection + ((ContentScraper) this.scraperObject).close(); } + this.scraperObject = null; } - return null; + this.scraperObject = scraper; } - + public Set getContentLanguages() { return this.languages; } @@ -978,6 +994,7 @@ dc_rights if (scraper instanceof ContentScraper) { final ContentScraper html = (ContentScraper) scraper; html.close(); + doc.scraperObject = null; } } diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 8d54f93c3..07b3f4560 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -334,9 +334,9 @@ public class DCEntry extends MultiMapSolrParams { t.add(getTitle()); // for processing during indexing, embed entry as source scraperObject in a standard parserobj object - genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing + genericParser parserobj = new genericParser(); // init the simplest parser; this DCEntry is attached to the Document via setScraperObject() below - return new Document( + Document document = new Document( getIdentifier(true), "text/html", StandardCharsets.UTF_8.name(), @@ -355,6 +355,8 @@ public class DCEntry extends MultiMapSolrParams { null, false, getDate()); + document.setScraperObject(this); // TODO: used during indexing to access some possible but special YaCy meta tags in surrogate source ( value ) -> optimize/find alternative + return document; } public void writeXML(OutputStreamWriter os) throws IOException { diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 1d9f2d1c8..0d6d64d6b 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ 
b/source/net/yacy/document/parser/genericParser.java @@ -44,17 +44,6 @@ public class genericParser extends AbstractParser implements Parser { // this parser is used if no other fits. This parser fits all } - /** - * Constructor to allow to set a scraperObject - * because it is desired to keep the scraper/source object protected - * This is used for surrogate parsers to set a other source/scraper then ContentScraper - * @param scraper - */ - public genericParser(Object scraper) { - super("Generic Parser"); - this.scraperObject = scraper; - } - @Override public Document[] parse( final DigestURL location, diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 12a00fe30..56586e6d1 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -103,8 +103,7 @@ public class htmlParser extends AbstractParser implements Parser { try { // first get a document from the parsed html Charset[] detectedcharsetcontainer = new Charset[]{null}; - scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); - ContentScraper scraper = (ContentScraper)scraperObject; // shortcut to access ContentScraper methodes + ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); // parseToScraper also detects/corrects/sets charset from html content tag final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); Document documentSnapshot = null; @@ -172,8 +171,9 @@ public class htmlParser extends AbstractParser implements Parser { noDoubleImages, scraper.indexingDenied(), scraper.getDate()); + ppd.setScraperObject(scraper); ppd.setIcons(scraper.getIcons()); - + return ppd; } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java 
b/source/net/yacy/search/schema/CollectionConfiguration.java index 556275392..8317e6729 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -520,7 +520,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri DigestURL canonical = null; processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values()); - + if (scraper instanceof ContentScraper) { final ContentScraper html = (ContentScraper) scraper; List images = html.getImages(); @@ -829,7 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri DCEntry dcentry = (DCEntry) scraper; for (Map.Entry entry: dcentry.getMap().entrySet()) { String tag = entry.getKey(); - if (!tag.startsWith("md:") || tag.length() < 4) continue; + if (!tag.startsWith("md:") || tag.length() < 4) continue; // md: is a YaCy internal identifier for metadata in surrogate.xml files ( md:SOLR_FIELDNAME ) CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3)); if (solr_field == null) continue; String[] values = entry.getValue();