fix concurrency issue where htmlParser used stale (not current) scraper data

resulting in incorrect data for some html index metadata.
Details see http://mantis.tokeek.de/view.php?id=717
pull/105/head
reger 8 years ago
parent b154d3eb87
commit 4c9be29a55

@ -39,7 +39,6 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>(); protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>(); protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name; private final String name;
protected Object scraperObject; // used scraper or source object if any, otherwise null
/** /**
* initialize a parser with a name * initialize a parser with a name

@ -93,7 +93,9 @@ public class Document {
private final Set<String> languages; private final Set<String> languages;
private boolean indexingDenied; private boolean indexingDenied;
private final double lon, lat; private final double lon, lat;
private final Parser parserObject; // the source object that was used to create the Document private final Parser parserObject; // the parser object that was used to create the Document
// TODO: to allow to use scraper during indexing (for some parsers) it has to be remembered here, but it holds redundant information.
private Object scraperObject; // remember the source object that was used to create the Document (used during indexing)
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified; // creation or last modification date of the source document private final Date lastModified; // creation or last modification date of the source document
private int crawldepth; private int crawldepth;
@ -150,8 +152,9 @@ public class Document {
this.generic_facets = new HashMap<String, Set<String>>(); this.generic_facets = new HashMap<String, Set<String>>();
this.lastModified = lastModified == null ? new Date() : lastModified; this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet this.crawldepth = 999; // unknown yet
this.scraperObject = null; // will be set by setScraperObject()
} }
/** /**
* Get the content domain of a document. This tries to get the content domain from the mime type * Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension. * and if this fails it uses alternatively the content domain from the file extension.
@ -172,19 +175,32 @@ public class Document {
} }
/** /**
* Confinient call to get the source/scraper object of the underlaying parser * Convenient call to get the source/scraper object of the underlying parser
* if the parser uses a scraper, like htmlParser * if the parser uses a scraper, like htmlParser
* @return scraper object typically of type ContentScraper but may also of type DCEntry * @return scraper object typically of type ContentScraper but may also of type DCEntry
*/ */
public Object getScraperObject() { public Object getScraperObject() {
if (this.parserObject instanceof AbstractParser) { return this.scraperObject;
if (((AbstractParser) this.parserObject).scraperObject != null) { }
return ((AbstractParser) this.parserObject).scraperObject;
/**
* Remember the scraper object used, so the used scraper can later be
* accessed via getScraperObject().
* This is used by surrogate parsers to set another source/scraper than the
* ContentScraper used e.g. by htmlParser.
* @param scraper
*/
public void setScraperObject(Object scraper) {
if (this.scraperObject != null) {
if (this.scraperObject instanceof ContentScraper) {
// support garbage collection
((ContentScraper) this.scraperObject).close();
} }
this.scraperObject = null;
} }
return null; this.scraperObject = scraper;
} }
public Set<String> getContentLanguages() { public Set<String> getContentLanguages() {
return this.languages; return this.languages;
} }
@ -978,6 +994,7 @@ dc_rights
if (scraper instanceof ContentScraper) { if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper; final ContentScraper html = (ContentScraper) scraper;
html.close(); html.close();
doc.scraperObject = null;
} }
} }

@ -334,9 +334,9 @@ public class DCEntry extends MultiMapSolrParams {
t.add(getTitle()); t.add(getTitle());
// for processing during indexing, embed entry as source scraperObject in a standard parserobj object // for processing during indexing, embed entry as source scraperObject in a standard parserobj object
genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing genericParser parserobj = new genericParser(); // init the simplest parser with DCEntry as source/scraperObject used during indexing
return new Document( Document document = new Document(
getIdentifier(true), getIdentifier(true),
"text/html", "text/html",
StandardCharsets.UTF_8.name(), StandardCharsets.UTF_8.name(),
@ -355,6 +355,8 @@ public class DCEntry extends MultiMapSolrParams {
null, null,
false, false,
getDate()); getDate());
document.setScraperObject(this); // TODO: used during indexing to access some possible but special YaCy meta tags in surrogate source ( <md:solrfilename>value ) -> optimize/find alternative
return document;
} }
public void writeXML(OutputStreamWriter os) throws IOException { public void writeXML(OutputStreamWriter os) throws IOException {

@ -44,17 +44,6 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all // this parser is used if no other fits. This parser fits all
} }
/**
* Constructor to allow to set a scraperObject
* because it is desired to keep the scraper/source object protected
* This is used for surrogate parsers to set a other source/scraper then ContentScraper
* @param scraper
*/
public genericParser(Object scraper) {
super("Generic Parser");
this.scraperObject = scraper;
}
@Override @Override
public Document[] parse( public Document[] parse(
final DigestURL location, final DigestURL location,

@ -103,8 +103,7 @@ public class htmlParser extends AbstractParser implements Parser {
try { try {
// first get a document from the parsed html // first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks); ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
ContentScraper scraper = (ContentScraper)scraperObject; // shortcut to access ContentScraper methodes
// parseToScraper also detects/corrects/sets charset from html content tag // parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null; Document documentSnapshot = null;
@ -172,8 +171,9 @@ public class htmlParser extends AbstractParser implements Parser {
noDoubleImages, noDoubleImages,
scraper.indexingDenied(), scraper.indexingDenied(),
scraper.getDate()); scraper.getDate());
ppd.setScraperObject(scraper);
ppd.setIcons(scraper.getIcons()); ppd.setIcons(scraper.getIcons());
return ppd; return ppd;
} }

@ -520,7 +520,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
DigestURL canonical = null; DigestURL canonical = null;
processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values()); processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values());
if (scraper instanceof ContentScraper) { if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper; final ContentScraper html = (ContentScraper) scraper;
List<ImageEntry> images = html.getImages(); List<ImageEntry> images = html.getImages();
@ -829,7 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
DCEntry dcentry = (DCEntry) scraper; DCEntry dcentry = (DCEntry) scraper;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) { for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey(); String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue; if (!tag.startsWith("md:") || tag.length() < 4) continue; // md: is a YaCy internal identifier for metadata in surrogate.xml files ( md:SOLR_FIELDNAME )
CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3)); CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
if (solr_field == null) continue; if (solr_field == null) continue;
String[] values = entry.getValue(); String[] values = entry.getValue();

Loading…
Cancel
Save