fix concurrency issue where htmlParser used stale (not current) scraper data

resulting in incorrect data for some html index metadata.
Details see http://mantis.tokeek.de/view.php?id=717
pull/105/head
reger 8 years ago
parent b154d3eb87
commit 4c9be29a55

@ -39,7 +39,6 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
protected Object scraperObject; // used scraper or source object if any, otherwise null
/**
* initialize a parser with a name

@ -93,7 +93,9 @@ public class Document {
private final Set<String> languages;
private boolean indexingDenied;
private final double lon, lat;
private final Parser parserObject; // the source object that was used to create the Document
private final Parser parserObject; // the parser object that was used to create the Document
// TODO: to allow to use scraper during indexing (for some parsers) it has to be remembered here, but it holds redundant information.
private Object scraperObject; // remember the source object that was used to create the Document (used during indexing)
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified; // creation or last modification date of the source document
private int crawldepth;
@ -150,8 +152,9 @@ public class Document {
this.generic_facets = new HashMap<String, Set<String>>();
this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet
this.scraperObject = null; // will be set by setScraperObject()
}
/**
* Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension.
@ -172,19 +175,32 @@ public class Document {
}
/**
* Confinient call to get the source/scraper object of the underlaying parser
* Convenient call to get the source/scraper object of the underlaying parser
* if the parser uses a scraper, like htmlParser
* @return scraper object typically of type ContentScraper but may also of type DCEntry
*/
public Object getScraperObject() {
if (this.parserObject instanceof AbstractParser) {
if (((AbstractParser) this.parserObject).scraperObject != null) {
return ((AbstractParser) this.parserObject).scraperObject;
return this.scraperObject;
}
/**
 * Remember the scraper object used, so it can later be retrieved via
 * getScraperObject().
 * This is used by surrogate parsers to set a source/scraper other than
 * ContentScraper (which is used e.g. by htmlParser).
 * Any previously remembered ContentScraper is closed first to support
 * garbage collection.
 * @param scraper the source/scraper object to remember (may be null)
 */
public void setScraperObject(Object scraper) {
    if (this.scraperObject != null) {
        if (this.scraperObject instanceof ContentScraper) {
            // support garbage collection
            ((ContentScraper) this.scraperObject).close();
        }
        // drop the old reference before installing the new one
        this.scraperObject = null;
    }
    this.scraperObject = scraper;
}
/**
 * Get the languages associated with this document.
 * @return the set of languages stored for this document
 */
public Set<String> getContentLanguages() {
return this.languages;
}
@ -978,6 +994,7 @@ dc_rights
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
html.close();
doc.scraperObject = null;
}
}

@ -334,9 +334,9 @@ public class DCEntry extends MultiMapSolrParams {
t.add(getTitle());
// for processing during indexing, embed entry as source scraperObject in a standard parserobj object
genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing
genericParser parserobj = new genericParser(); // init the simplest parser with DCEntry as source/scraperObject used during indexing
return new Document(
Document document = new Document(
getIdentifier(true),
"text/html",
StandardCharsets.UTF_8.name(),
@ -355,6 +355,8 @@ public class DCEntry extends MultiMapSolrParams {
null,
false,
getDate());
document.setScraperObject(this); // TODO: used during indexing to access some possible but special YaCy meta tags in surrogate source ( <md:solrfilename>value ) -> optimize/find alternative
return document;
}
public void writeXML(OutputStreamWriter os) throws IOException {

@ -44,17 +44,6 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
/**
 * Constructor that allows setting the scraperObject directly,
 * because the scraper/source object is kept protected (in AbstractParser).
 * This is used by surrogate parsers to set a source/scraper other than
 * ContentScraper.
 * @param scraper the source/scraper object to remember for indexing
 */
public genericParser(Object scraper) {
super("Generic Parser");
this.scraperObject = scraper;
}
@Override
public Document[] parse(
final DigestURL location,

@ -103,8 +103,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
ContentScraper scraper = (ContentScraper)scraperObject; // shortcut to access ContentScraper methodes
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -172,8 +171,9 @@ public class htmlParser extends AbstractParser implements Parser {
noDoubleImages,
scraper.indexingDenied(),
scraper.getDate());
ppd.setScraperObject(scraper);
ppd.setIcons(scraper.getIcons());
return ppd;
}

@ -520,7 +520,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
DigestURL canonical = null;
processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values());
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
List<ImageEntry> images = html.getImages();
@ -829,7 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
DCEntry dcentry = (DCEntry) scraper;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;
if (!tag.startsWith("md:") || tag.length() < 4) continue; // md: is a YaCy internal identifier for metadata in surrugate.xml files ( md:SOLR_FIELDNAME )
CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
if (solr_field == null) continue;
String[] values = entry.getValue();

Loading…
Cancel
Save