From 8ebefa42332739bcd357395f1d6950971aabb24d Mon Sep 17 00:00:00 2001
From: luc <luc@debianluc>
Date: Tue, 8 Dec 2015 03:34:03 +0100
Subject: [PATCH] Fixed MediaWiki import: DCEntry conversion to
 SolrInputDocument was failing. It appears to have been broken since commit
 b43811d38c426ce5237df7d37c88285644e6a544

---
 .../document/content/SurrogateReader.java     | 11 ++--
 source/net/yacy/search/Switchboard.java       | 62 +++++++++++++------
 2 files changed, 50 insertions(+), 23 deletions(-)
diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java
index 5d80a2f7b..22557a97d 100644
--- a/source/net/yacy/document/content/SurrogateReader.java
+++ b/source/net/yacy/document/content/SurrogateReader.java
@@ -75,7 +75,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
     private boolean parsingValue;
     private DCEntry dcEntry;
     private String elementName;
-    private final BlockingQueue<SolrInputDocument> surrogates;
+    /** Surrogates are either SolrInputDocument or DCEntry instances. */
+    private final BlockingQueue<Object> surrogates;
     private SAXParser saxParser;
     private final InputSource inputSource;
     private final PushbackInputStream inputStream;
@@ -145,7 +146,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
                                 DigestURL url = new DigestURL(u);
                                 final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(url);
                                 if ( urlRejectReason == null ) {
-                                    // convert DCEntry to SolrInputDocument
+                                    // convert SolrDocument to SolrInputDocument
                                     this.surrogates.put(this.configuration.toSolrInputDocument(doc));
                                 }
                             } catch (MalformedURLException e) {
@@ -238,8 +239,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
                 // check if url is in accepted domain
                 final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true));
                 if ( urlRejectReason == null ) {
-                    // convert DCEntry to SolrInputDocument
-                    this.surrogates.put(this.configuration.toSolrInputDocument(this.dcEntry));
+                    // DCEntry cannot be converted to SolrInputDocument, as the DC schema has nothing to do with the Solr collection schema
+                    this.surrogates.put(this.dcEntry);
                 }
             } catch (final InterruptedException e) {
                 ConcurrentLog.logException(e);
@@ -293,7 +294,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
         }
     }
 
-    public SolrInputDocument take() {
+    public Object take() {
         try {
             return this.surrogates.take();
         } catch (final InterruptedException e) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 25deaf73a..a74993809 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -158,6 +158,7 @@ import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.Parser.Failure;
 import net.yacy.document.Tokenizer;
+import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.parser.audioTagParser;
@@ -2012,25 +2013,50 @@ public final class Switchboard extends serverSwitch {
                 @Override
                 public void run() {
                     VocabularyScraper scraper = new VocabularyScraper();
-                    SolrInputDocument surrogate;
-                    while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
-                        assert surrogate != null;
-                        try {
-                            // enrich the surrogate
-                            final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
-                            final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
-                            if (text != null && text.length() > 0) {
-                                // run the tokenizer on the text to get vocabularies and synonyms
-                                final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
-                                final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
-                                // overwrite the given vocabularies and synonyms with new computed ones
-                                Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
-                            }
-                        } catch (MalformedURLException e) {
-                            ConcurrentLog.logException(e);
+                    Object surrogateObj;
+                    while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
+                        assert surrogateObj != null;
+                        /* When parsing a full-text Solr XML data dump, SurrogateReader produces SolrInputDocument instances */
+                        if(surrogateObj instanceof SolrInputDocument) {
+                        	SolrInputDocument surrogate = (SolrInputDocument)surrogateObj;
+                        	try {
+                        		// enrich the surrogate
+                        		final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
+                        		final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+                        		if (text != null && text.length() > 0 && id != null ) {
+                            		final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
+                        			// run the tokenizer on the text to get vocabularies and synonyms
+                        			final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
+                        			final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
+                        			// overwrite the given vocabularies and synonyms with new computed ones
+                        			Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
+                        		}
+                        	} catch (MalformedURLException e) {
+                        		ConcurrentLog.logException(e);
+                        	}
+                        	// write the surrogate into the index
+                        	Switchboard.this.index.putDocument(surrogate);
+                        } else if(surrogateObj instanceof DCEntry) {
+                        	/* When parsing a MediaWiki dump, SurrogateReader produces DCEntry instances */
+                            // create a queue entry
+                        	final DCEntry entry = (DCEntry)surrogateObj;
+                            final Document document = entry.document();
+                            final Request request =
+                                new Request(
+                                    ASCII.getBytes(peers.mySeed().hash),
+                                    entry.getIdentifier(true),
+                                    null,
+                                    "",
+                                    entry.getDate(),
+                                    crawler.defaultSurrogateProfile.handle(),
+                                    0,
+                                    crawler.defaultSurrogateProfile.timezoneOffset());
+                            final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null);
+                            final IndexingQueueEntry queueEntry =
+                                new IndexingQueueEntry(response, new Document[] {document}, null);
+                
+                            indexingCondensementProcessor.enQueue(queueEntry);
                         }
-                        // write the surrogate into the index
-                        Switchboard.this.index.putDocument(surrogate);
                         if (shallTerminate()) break;
                     }
                 }