Merge pull request #32 from luccioman/master

Fix for MediaWiki import (mantis 625)
Michael Peter Christen 9 years ago
commit b2fac989fd

@@ -40,12 +40,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@@ -56,6 +50,11 @@ import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
public class SurrogateReader extends DefaultHandler implements Runnable {
@@ -76,13 +75,15 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private boolean parsingValue;
private DCEntry dcEntry;
private String elementName;
private final BlockingQueue<SolrInputDocument> surrogates;
/** Surrogates are either SolrInputDocument or DCEntry instances*/
private final BlockingQueue<Object> surrogates;
private SAXParser saxParser;
private final InputSource inputSource;
private final PushbackInputStream inputStream;
private final CrawlStacker crawlStacker;
private final CollectionConfiguration configuration;
private final int concurrency;
private String charsetName = "UTF-8";
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
private static SAXParser getParser() throws SAXException {
@@ -112,9 +113,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.elementName = null;
this.surrogates = new ArrayBlockingQueue<>(queueSize);
Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
Reader reader = new BufferedReader(new InputStreamReader(stream, this.charsetName));
this.inputSource = new InputSource(reader);
this.inputSource.setEncoding("UTF-8");
this.inputSource.setEncoding(this.charsetName);
this.inputStream = stream;
try {
@@ -130,7 +131,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// test the syntax of the stream by reading parts of the beginning
try {
if (isSolrDump()) {
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, "UTF-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charsetName));
String line;
while ((line = br.readLine()) != null) {
if (!line.startsWith("<doc>")) continue;
@@ -145,7 +146,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
DigestURL url = new DigestURL(u);
final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(url);
if ( urlRejectReason == null ) {
// convert DCEntry to SolrInputDocument
// convert SolrDocument to SolrInputDocument
this.surrogates.put(this.configuration.toSolrInputDocument(doc));
}
} catch (MalformedURLException e) {
@@ -180,26 +181,33 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
/**
* @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html)
*/
private boolean isSolrDump() {
try {
boolean res = false;
byte[] b = new byte[100];
this.inputStream.read(b);
int nbRead = -1;
try {
String s = UTF8.String(b);
nbRead = this.inputStream.read(b);
if(nbRead > 0) {
String s = new String(b, 0, nbRead, this.charsetName);
if ((s.contains("<response>") && s.contains("<result>")) || s.startsWith("<doc>")) {
this.inputStream.unread(b);
return true;
res = true;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
this.inputStream.unread(b);
return false;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
return false;
} finally {
if (nbRead > 0) {
try {
this.inputStream.unread(b, 0, nbRead);
} catch (IOException e2) {
ConcurrentLog.logException(e2);
}
}
}
return false;
return res;
}
@Override
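The refactored isSolrDump() above depends on two PushbackInputStream details: read(byte[]) may return fewer bytes than requested (or -1 on an empty stream), and unread() must push back exactly the bytes that were read, on every path, so the SAX parser later sees the stream from its beginning. A minimal, self-contained sketch of that peek-and-rewind pattern (class name and sample data are illustrative, not part of the patch):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.nio.charset.StandardCharsets;

public class SolrDumpPeekDemo {
    public static void main(String[] args) throws IOException {
        byte[] data = "<response>\n<result>\n<doc>...".getBytes(StandardCharsets.UTF_8);
        // the pushback buffer must be at least as large as the peeked block (100 bytes here)
        PushbackInputStream in = new PushbackInputStream(new ByteArrayInputStream(data), 100);

        byte[] b = new byte[100];
        int nbRead = in.read(b); // may be shorter than b.length, or -1 on an empty stream
        boolean looksLikeSolrDump = false;
        if (nbRead > 0) {
            String head = new String(b, 0, nbRead, StandardCharsets.UTF_8);
            looksLikeSolrDump = (head.contains("<response>") && head.contains("<result>"))
                    || head.startsWith("<doc>");
            in.unread(b, 0, nbRead); // push back exactly the bytes that were read
        }
        System.out.println(looksLikeSolrDump); // true
        System.out.println((char) in.read());  // '<' -- the stream is rewound for the real parser
    }
}
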
@@ -231,8 +239,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// check if url is in accepted domain
final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true));
if ( urlRejectReason == null ) {
// convert DCEntry to SolrInputDocument
this.surrogates.put(this.configuration.toSolrInputDocument(this.dcEntry));
// DCEntry can not be converted to SolrInputDocument as DC schema has nothing to do with Solr collection schema
this.surrogates.put(this.dcEntry);
}
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
@@ -286,7 +294,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
public SolrInputDocument take() {
public Object take() {
try {
return this.surrogates.take();
} catch (final InterruptedException e) {

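Because the queue now carries both SolrInputDocument and DCEntry instances, take() returns Object and every caller has to dispatch on the runtime type; the Switchboard hunk further down does exactly that. A minimal consumer sketch (reader, index and handleDcEntry are placeholder names, not part of the patch):

Object surrogateObj;
while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT) {
    if (surrogateObj instanceof SolrInputDocument) {
        // record from a full-text Solr dump: already matches the collection schema, index it directly
        index.putDocument((SolrInputDocument) surrogateObj);
    } else if (surrogateObj instanceof DCEntry) {
        // Dublin Core record (e.g. from a MediaWiki dump): send it through the regular parsing/indexing queue
        handleDcEntry((DCEntry) surrogateObj);
    }
}
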
@@ -750,16 +750,21 @@ public class MediawikiImporter extends Thread implements Importer {
public static void main(final String[] s) {
if (s.length == 0) {
ConcurrentLog.info("WIKITRANSLATION", "usage:");
ConcurrentLog.info("WIKITRANSLATION", " -index <wikipedia-dump>");
ConcurrentLog.info("WIKITRANSLATION", " -read <start> <len> <idx-file>");
ConcurrentLog.info("WIKITRANSLATION", " -find <title> <wikipedia-dump>");
ConcurrentLog.info("WIKITRANSLATION", " -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
System.exit(0);
System.out.println("usage:");
System.out.println(" -index <wikipedia-dump>");
System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
ConcurrentLog.shutdown();
return;
}
try {
// example:
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
// java -Xmx2000m -cp classes:lib/bzip2.jar
// de.anomic.tools.mediawikiIndex -convert
// DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2
// DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2) {
final File sourcefile = new File(s[1]);
@@ -802,7 +807,9 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
System.exit(0);
} finally {
ConcurrentLog.shutdown();
}
}
}
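The usage branch above now stops the logging worker and returns instead of calling System.exit(0) directly, and the rest of main() is wrapped in try/finally so the logger is also shut down when an exception escapes. A stripped-down sketch of the resulting control flow (usage text and the actual work elided):

public static void main(final String[] args) {
    if (args.length == 0) {
        System.out.println("usage: ...");
        ConcurrentLog.shutdown(); // stop the logging worker before the early return
        return;
    }
    try {
        // ... handle -convert / -index / -read / -find ...
        System.exit(0); // note: exit() terminates the JVM without running the finally block
    } finally {
        ConcurrentLog.shutdown(); // reached when an exception escapes the try block
    }
}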

@@ -158,6 +158,7 @@ import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.Parser.Failure;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.audioTagParser;
@@ -2012,14 +2013,18 @@ public final class Switchboard extends serverSwitch {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
SolrInputDocument surrogate;
while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
assert surrogate != null;
Object surrogateObj;
while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
assert surrogateObj != null;
/* When parsing a full-text Solr xml data dump Surrogate reader produces SolrInputDocument instances */
if(surrogateObj instanceof SolrInputDocument) {
SolrInputDocument surrogate = (SolrInputDocument)surrogateObj;
try {
// enrich the surrogate
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
if (text != null && text.length() > 0) {
if (text != null && text.length() > 0 && id != null ) {
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
@@ -2031,6 +2036,27 @@ public final class Switchboard extends serverSwitch {
}
// write the surrogate into the index
Switchboard.this.index.putDocument(surrogate);
} else if(surrogateObj instanceof DCEntry) {
/* When parsing a MediaWiki dump Surrogate reader produces DCEntry instances */
// create a queue entry
final DCEntry entry = (DCEntry)surrogateObj;
final Document document = entry.document();
final Request request =
new Request(
ASCII.getBytes(peers.mySeed().hash),
entry.getIdentifier(true),
null,
"",
entry.getDate(),
crawler.defaultSurrogateProfile.handle(),
0,
crawler.defaultSurrogateProfile.timezoneOffset());
final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null);
final IndexingQueueEntry queueEntry =
new IndexingQueueEntry(response, new Document[] {document}, null);
indexingCondensementProcessor.enQueue(queueEntry);
}
if (shallTerminate()) break;
}
}
