From 8ebefa42332739bcd357395f1d6950971aabb24d Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 8 Dec 2015 03:34:03 +0100 Subject: [PATCH] Fixed MediaWiki import : DCEntry conversion to SolrInputDocument was failing. Looks like it was broken since Commit b43811d38c426ce5237df7d37c88285644e6a544 --- .../document/content/SurrogateReader.java | 11 ++-- source/net/yacy/search/Switchboard.java | 62 +++++++++++++------ 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 5d80a2f7b..22557a97d 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -75,7 +75,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable { private boolean parsingValue; private DCEntry dcEntry; private String elementName; - private final BlockingQueue<SolrInputDocument> surrogates; + /** Surrogates are either SolrInputDocument or DCEntry instances*/ + private final BlockingQueue<Object> surrogates; private SAXParser saxParser; private final InputSource inputSource; private final PushbackInputStream inputStream; @@ -145,7 +146,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { DigestURL url = new DigestURL(u); final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(url); if ( urlRejectReason == null ) { - // convert DCEntry to SolrInputDocument + // convert SolrDocument to SolrInputDocument this.surrogates.put(this.configuration.toSolrInputDocument(doc)); } } catch (MalformedURLException e) { @@ -238,8 +239,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable { // check if url is in accepted domain final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true)); if ( urlRejectReason == null ) { - // convert DCEntry to SolrInputDocument - 
this.surrogates.put(this.configuration.toSolrInputDocument(this.dcEntry)); + // DCEntry can not be converted to SolrInputDocument as DC schema has nothing to do with Solr collection schema + this.surrogates.put(this.dcEntry); } } catch (final InterruptedException e) { ConcurrentLog.logException(e); @@ -293,7 +294,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } } - public SolrInputDocument take() { + public Object take() { try { return this.surrogates.take(); } catch (final InterruptedException e) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 25deaf73a..a74993809 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -158,6 +158,7 @@ import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.document.Parser.Failure; import net.yacy.document.Tokenizer; +import net.yacy.document.content.DCEntry; import net.yacy.document.content.SurrogateReader; import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.parser.audioTagParser; @@ -2012,25 +2013,50 @@ public final class Switchboard extends serverSwitch { @Override public void run() { VocabularyScraper scraper = new VocabularyScraper(); - SolrInputDocument surrogate; - while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) { - assert surrogate != null; - try { - // enrich the surrogate - final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName()))); - final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); - if (text != null && text.length() > 0) { - // run the tokenizer on the text to get vocabularies and synonyms - final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper); - final Map<String, Set<String>> facets = 
Document.computeGenericFacets(tokenizer.tags()); - // overwrite the given vocabularies and synonyms with new computed ones - Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets); - } - } catch (MalformedURLException e) { - ConcurrentLog.logException(e); + Object surrogateObj; + while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT ) { + assert surrogateObj != null; + /* When parsing a full-text Solr xml data dump Surrogate reader produces SolrInputDocument instances */ + if(surrogateObj instanceof SolrInputDocument) { + SolrInputDocument surrogate = (SolrInputDocument)surrogateObj; + try { + // enrich the surrogate + final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName()); + final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + if (text != null && text.length() > 0 && id != null ) { + final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id)); + // run the tokenizer on the text to get vocabularies and synonyms + final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper); + final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags()); + // overwrite the given vocabularies and synonyms with new computed ones + Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets); + } + } catch (MalformedURLException e) { + ConcurrentLog.logException(e); + } + // write the surrogate into the index + Switchboard.this.index.putDocument(surrogate); + } else if(surrogateObj instanceof DCEntry) { + /* When parsing a MediaWiki dump Surrogate reader produces DCEntry instances */ + // create a queue entry + final DCEntry entry = (DCEntry)surrogateObj; + final Document document = entry.document(); + final Request request = + new Request( + ASCII.getBytes(peers.mySeed().hash), 
+ entry.getIdentifier(true), + null, + "", + entry.getDate(), + crawler.defaultSurrogateProfile.handle(), + 0, + crawler.defaultSurrogateProfile.timezoneOffset()); + final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null); + final IndexingQueueEntry queueEntry = + new IndexingQueueEntry(response, new Document[] {document}, null); + + indexingCondensementProcessor.enQueue(queueEntry); } - // write the surrogate into the index - Switchboard.this.index.putDocument(surrogate); if (shallTerminate()) break; } }