Fixed MediaWiki import: DCEntry conversion to SolrInputDocument was failing.

It looks like it has been broken since commit b43811d38c.
9 years ago · 8ebefa4233
parent 7736ee5a42
commit 8ebefa4233
2 changed files with 50 additions and 23 deletions
--- a/source/net/yacy/document/content/SurrogateReader.java
+++ b/source/net/yacy/document/content/SurrogateReader.java
@@ -75,7 +75,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
    private boolean parsingValue;
    private DCEntry dcEntry;
    private String elementName;
-    private final BlockingQueue<SolrInputDocument> surrogates;
+    /** Surrogates are either SolrInputDocument or DCEntry instances. */
+    private final BlockingQueue<Object> surrogates;
    private SAXParser saxParser;
    private final InputSource inputSource;
    private final PushbackInputStream inputStream;
@@ -145,7 +146,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
                                DigestURL url = new DigestURL(u);
                                final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(url);
                                if ( urlRejectReason == null ) {
-                                    // convert DCEntry to SolrInputDocument
+                                    // convert SolrDocument to SolrInputDocument
                                    this.surrogates.put(this.configuration.toSolrInputDocument(doc));
                                }
                            } catch (MalformedURLException e) {
@@ -238,8 +239,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
                // check if url is in accepted domain
                final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true));
                if ( urlRejectReason == null ) {
-                    // convert DCEntry to SolrInputDocument
-                    this.surrogates.put(this.configuration.toSolrInputDocument(this.dcEntry));
+                    // A DCEntry cannot be converted to a SolrInputDocument here, because the DC schema has nothing to do with the Solr collection schema
+                    this.surrogates.put(this.dcEntry);
                }
            } catch (final InterruptedException e) {
                ConcurrentLog.logException(e);
@@ -293,7 +294,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
        }
    }

-    public SolrInputDocument take() {
+    public Object take() {
        try {
            return this.surrogates.take();
        } catch (final InterruptedException e) {
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -158,6 +158,7 @@ import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.Parser.Failure;
 import net.yacy.document.Tokenizer;
+import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.parser.audioTagParser;
@@ -2012,25 +2013,50 @@ public final class Switchboard extends serverSwitch {
                @Override
                public void run() {
                    VocabularyScraper scraper = new VocabularyScraper();
-                    SolrInputDocument surrogate;
-                    while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
-                        assert surrogate != null;
-                        try {
-                            // enrich the surrogate
-                            final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
-                            final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
-                            if (text != null && text.length() > 0) {
-                                // run the tokenizer on the text to get vocabularies and synonyms
-                                final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
-                                final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
-                                // overwrite the given vocabularies and synonyms with new computed ones
-                                Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
-                            }
-                        } catch (MalformedURLException e) {
-                            ConcurrentLog.logException(e);
+                    Object surrogateObj;
+                    while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
+                        assert surrogateObj != null;
+                        /* When parsing a full-text Solr xml data dump Surrogate reader produces SolrInputDocument instances */
+                        if(surrogateObj instanceof SolrInputDocument) {
+                        	SolrInputDocument surrogate = (SolrInputDocument)surrogateObj;
+                        	try {
+                        		// enrich the surrogate
+                        		final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
+                        		final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+                        		if (text != null && text.length() > 0 && id != null ) {
+                            		final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
+                        			// run the tokenizer on the text to get vocabularies and synonyms
+                        			final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
+                        			final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
+                        			// overwrite the given vocabularies and synonyms with new computed ones
+                        			Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
+                        		}
+                        	} catch (MalformedURLException e) {
+                        		ConcurrentLog.logException(e);
+                        	}
+                        	// write the surrogate into the index
+                        	Switchboard.this.index.putDocument(surrogate);
+                        } else if(surrogateObj instanceof DCEntry) {
+                        	/* When parsing a MediaWiki dump Surrogate reader produces DCEntry instances */
+                            // create a queue entry
+                        	final DCEntry entry = (DCEntry)surrogateObj;
+                            final Document document = entry.document();
+                            final Request request =
+                                new Request(
+                                    ASCII.getBytes(peers.mySeed().hash),
+                                    entry.getIdentifier(true),
+                                    null,
+                                    "",
+                                    entry.getDate(),
+                                    crawler.defaultSurrogateProfile.handle(),
+                                    0,
+                                    crawler.defaultSurrogateProfile.timezoneOffset());
+                            final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null);
+                            final IndexingQueueEntry queueEntry =
+                                new IndexingQueueEntry(response, new Document[] {document}, null);
+                
+                            indexingCondensementProcessor.enQueue(queueEntry);
                        }
-                        // write the surrogate into the index
-                        Switchboard.this.index.putDocument(surrogate);
                        if (shallTerminate()) break;
                    }
                }