Merge pull request #32 from luccioman/master

Fix for MediaWiki import (mantis 625)
Michael Peter Christen 9 years ago
commit b2fac989fd

@@ -40,12 +40,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@@ -56,6 +50,11 @@ import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
public class SurrogateReader extends DefaultHandler implements Runnable {
@@ -76,13 +75,15 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private boolean parsingValue;
private DCEntry dcEntry;
private String elementName;
private final BlockingQueue<SolrInputDocument> surrogates;
/** Surrogates are either SolrInputDocument or DCEntry instances*/
private final BlockingQueue<Object> surrogates;
private SAXParser saxParser;
private final InputSource inputSource;
private final PushbackInputStream inputStream;
private final CrawlStacker crawlStacker;
private final CollectionConfiguration configuration;
private final int concurrency;
private String charsetName = "UTF-8";
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
private static SAXParser getParser() throws SAXException {
@@ -112,9 +113,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.elementName = null;
this.surrogates = new ArrayBlockingQueue<>(queueSize);
Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
Reader reader = new BufferedReader(new InputStreamReader(stream, this.charsetName));
this.inputSource = new InputSource(reader);
this.inputSource.setEncoding("UTF-8");
this.inputSource.setEncoding(this.charsetName);
this.inputStream = stream;
try {
@@ -130,7 +131,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// test the syntax of the stream by reading parts of the beginning
try {
if (isSolrDump()) {
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, "UTF-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charsetName));
String line;
while ((line = br.readLine()) != null) {
if (!line.startsWith("<doc>")) continue;
@@ -145,7 +146,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
DigestURL url = new DigestURL(u);
final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(url);
if ( urlRejectReason == null ) {
// convert DCEntry to SolrInputDocument
// convert SolrDocument to SolrInputDocument
this.surrogates.put(this.configuration.toSolrInputDocument(doc));
}
} catch (MalformedURLException e) {
@@ -180,26 +181,33 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
/**
* @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html)
*/
private boolean isSolrDump() {
try {
boolean res = false;
byte[] b = new byte[100];
this.inputStream.read(b);
int nbRead = -1;
try {
String s = UTF8.String(b);
nbRead = this.inputStream.read(b);
if(nbRead > 0) {
String s = new String(b, 0, nbRead, this.charsetName);
if ((s.contains("<response>") && s.contains("<result>")) || s.startsWith("<doc>")) {
this.inputStream.unread(b);
return true;
res = true;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
this.inputStream.unread(b);
return false;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
return false;
} finally {
if (nbRead > 0) {
try {
this.inputStream.unread(b, 0, nbRead);
} catch (IOException e2) {
ConcurrentLog.logException(e2);
}
}
}
return false;
return res;
}
@Override
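The refactored isSolrDump() above depends on two PushbackInputStream details: read(byte[]) may return fewer bytes than requested (or -1 on an empty stream), and unread() must push back exactly the bytes that were read, on every path, so the SAX parser later sees the stream from its beginning. A minimal, self-contained sketch of that peek-and-rewind pattern (class name and sample data are illustrative, not part of the patch):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.nio.charset.StandardCharsets;

public class SolrDumpPeekDemo {
    public static void main(String[] args) throws IOException {
        byte[] data = "<response>\n<result>\n<doc>...".getBytes(StandardCharsets.UTF_8);
        // the pushback buffer must be at least as large as the peeked block (100 bytes here)
        PushbackInputStream in = new PushbackInputStream(new ByteArrayInputStream(data), 100);

        byte[] b = new byte[100];
        int nbRead = in.read(b); // may be shorter than b.length, or -1 on an empty stream
        boolean looksLikeSolrDump = false;
        if (nbRead > 0) {
            String head = new String(b, 0, nbRead, StandardCharsets.UTF_8);
            looksLikeSolrDump = (head.contains("<response>") && head.contains("<result>"))
                    || head.startsWith("<doc>");
            in.unread(b, 0, nbRead); // push back exactly the bytes that were read
        }
        System.out.println(looksLikeSolrDump); // true
        System.out.println((char) in.read());  // '<' -- the stream is rewound for the real parser
    }
}
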
@@ -231,8 +239,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// check if url is in accepted domain
final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true));
if ( urlRejectReason == null ) {
// convert DCEntry to SolrInputDocument
this.surrogates.put(this.configuration.toSolrInputDocument(this.dcEntry));
// DCEntry can not be converted to SolrInputDocument as DC schema has nothing to do with Solr collection schema
this.surrogates.put(this.dcEntry);
}
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
@@ -286,7 +294,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
public SolrInputDocument take() {
public Object take() {
try {
return this.surrogates.take();
} catch (final InterruptedException e) {

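Because the queue now carries both SolrInputDocument and DCEntry instances, take() returns Object and every caller has to dispatch on the runtime type; the Switchboard hunk further down does exactly that. A minimal consumer sketch (reader, index and handleDcEntry are placeholder names, not part of the patch):

Object surrogateObj;
while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT) {
    if (surrogateObj instanceof SolrInputDocument) {
        // record from a full-text Solr dump: already matches the collection schema, index it directly
        index.putDocument((SolrInputDocument) surrogateObj);
    } else if (surrogateObj instanceof DCEntry) {
        // Dublin Core record (e.g. from a MediaWiki dump): send it through the regular parsing/indexing queue
        handleDcEntry((DCEntry) surrogateObj);
    }
}
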
@@ -750,16 +750,21 @@ public class MediawikiImporter extends Thread implements Importer {
public static void main(final String[] s) {
if (s.length == 0) {
ConcurrentLog.info("WIKITRANSLATION", "usage:");
ConcurrentLog.info("WIKITRANSLATION", " -index <wikipedia-dump>");
ConcurrentLog.info("WIKITRANSLATION", " -read <start> <len> <idx-file>");
ConcurrentLog.info("WIKITRANSLATION", " -find <title> <wikipedia-dump>");
ConcurrentLog.info("WIKITRANSLATION", " -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
System.exit(0);
System.out.println("usage:");
System.out.println(" -index <wikipedia-dump>");
System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
ConcurrentLog.shutdown();
return;
}
try {
// example:
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
// java -Xmx2000m -cp classes:lib/bzip2.jar
// de.anomic.tools.mediawikiIndex -convert
// DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2
// DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2) {
final File sourcefile = new File(s[1]);
@@ -802,7 +807,9 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
System.exit(0);
} finally {
ConcurrentLog.shutdown();
}
}
}
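The usage branch above now stops the logging worker and returns instead of calling System.exit(0) directly, and the rest of main() is wrapped in try/finally so the logger is also shut down when an exception escapes. A stripped-down sketch of the resulting control flow (usage text and the actual work elided):

public static void main(final String[] args) {
    if (args.length == 0) {
        System.out.println("usage: ...");
        ConcurrentLog.shutdown(); // stop the logging worker before the early return
        return;
    }
    try {
        // ... handle -convert / -index / -read / -find ...
        System.exit(0); // note: exit() terminates the JVM without running the finally block
    } finally {
        ConcurrentLog.shutdown(); // reached when an exception escapes the try block
    }
}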

@@ -158,6 +158,7 @@ import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.Parser.Failure;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.audioTagParser;
@@ -2012,14 +2013,18 @@ public final class Switchboard extends serverSwitch {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
SolrInputDocument surrogate;
while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
assert surrogate != null;
Object surrogateObj;
while ((surrogateObj = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
assert surrogateObj != null;
/* When parsing a full-text Solr xml data dump Surrogate reader produces SolrInputDocument instances */
if(surrogateObj instanceof SolrInputDocument) {
SolrInputDocument surrogate = (SolrInputDocument)surrogateObj;
try {
// enrich the surrogate
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
if (text != null && text.length() > 0) {
if (text != null && text.length() > 0 && id != null ) {
final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
@@ -2031,6 +2036,27 @@ public final class Switchboard extends serverSwitch {
}
// write the surrogate into the index
Switchboard.this.index.putDocument(surrogate);
} else if(surrogateObj instanceof DCEntry) {
/* When parsing a MediaWiki dump Surrogate reader produces DCEntry instances */
// create a queue entry
final DCEntry entry = (DCEntry)surrogateObj;
final Document document = entry.document();
final Request request =
new Request(
ASCII.getBytes(peers.mySeed().hash),
entry.getIdentifier(true),
null,
"",
entry.getDate(),
crawler.defaultSurrogateProfile.handle(),
0,
crawler.defaultSurrogateProfile.timezoneOffset());
final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null);
final IndexingQueueEntry queueEntry =
new IndexingQueueEntry(response, new Document[] {document}, null);
indexingCondensementProcessor.enQueue(queueEntry);
}
if (shallTerminate()) break;
}
}
