Fixed MediaWiki import : DCEntry conversion to SolrInputDocument was

failing. Looks like it was broken since Commit
b43811d38c
pull/32/head
luc 9 years ago
parent 7736ee5a42
commit 8ebefa4233

@ -75,7 +75,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private boolean parsingValue;
private DCEntry dcEntry;
private String elementName;
private final BlockingQueue<SolrInputDocument> surrogates;
/** Surrogates are either SolrInputDocument or DCEntry instances*/
private final BlockingQueue<Object> surrogates;
private SAXParser saxParser;
private final InputSource inputSource;
private final PushbackInputStream inputStream;
@ -145,7 +146,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
DigestURL url = new DigestURL(u);
final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(url);
if ( urlRejectReason == null ) {
// convert DCEntry to SolrInputDocument
// convert SolrDocument to SolrInputDocument
this.surrogates.put(this.configuration.toSolrInputDocument(doc));
}
} catch (MalformedURLException e) {
@ -238,8 +239,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// check if url is in accepted domain
final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true));
if ( urlRejectReason == null ) {
// convert DCEntry to SolrInputDocument
this.surrogates.put(this.configuration.toSolrInputDocument(this.dcEntry));
// DCEntry can not be converted to SolrInputDocument as DC schema has nothing to do with Solr collection schema
this.surrogates.put(this.dcEntry);
}
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
@ -293,7 +294,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
public SolrInputDocument take() {
public Object take() {
try {
return this.surrogates.take();
} catch (final InterruptedException e) {

@ -158,6 +158,7 @@ import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.Parser.Failure;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.audioTagParser;
@ -2012,25 +2013,50 @@ public final class Switchboard extends serverSwitch {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
SolrInputDocument surrogate;
while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
assert surrogate != null;
try {
// enrich the surrogate
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
if (text != null && text.length() > 0) {
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
}
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);
Object surrogateObj;
while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
assert surrogateObj != null;
/* When parsing a full-text Solr xml data dump Surrogate reader produces SolrInputDocument instances */
if(surrogateObj instanceof SolrInputDocument) {
SolrInputDocument surrogate = (SolrInputDocument)surrogateObj;
try {
// enrich the surrogate
final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
if (text != null && text.length() > 0 && id != null ) {
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
}
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);
}
// write the surrogate into the index
Switchboard.this.index.putDocument(surrogate);
} else if(surrogateObj instanceof DCEntry) {
/* When parsing a MediaWiki dump Surrogate reader produces DCEntry instances */
// create a queue entry
final DCEntry entry = (DCEntry)surrogateObj;
final Document document = entry.document();
final Request request =
new Request(
ASCII.getBytes(peers.mySeed().hash),
entry.getIdentifier(true),
null,
"",
entry.getDate(),
crawler.defaultSurrogateProfile.handle(),
0,
crawler.defaultSurrogateProfile.timezoneOffset());
final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null);
final IndexingQueueEntry queueEntry =
new IndexingQueueEntry(response, new Document[] {document}, null);
indexingCondensementProcessor.enQueue(queueEntry);
}
// write the surrogate into the index
Switchboard.this.index.putDocument(surrogate);
if (shallTerminate()) break;
}
}

Loading…
Cancel
Save