added zim importer rule for mdwiki

pull/621/head
Michael Peter Christen 1 year ago
parent 4a611ac6a3
commit 3d3bdb0f5f

@@ -108,63 +108,63 @@ public class ZimImporter extends Thread implements Importer {
            // read all documents
            for (int i = 0; i < this.file.header_entryCount; i++) {
                try {
                    if (this.abort) break;
                    DirectoryEntry de = this.reader.getDirectoryInfo(i);
                    if (!(de instanceof ZIMReader.ArticleEntry)) continue;
                    ArticleEntry ae = (ArticleEntry) de;
                    if (ae.namespace != 'C' && ae.namespace != 'A') continue;
                    // check url
                    DigestURL guessedUrl = guessURL(this.guessedSource, de);
                    if (recordCnt < 10) {
                        // critical test for the first 10 urls
                        if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
                            sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
                            return;
                        }
                    }
                    // check availability of text parser
                    String mimeType = ae.getMimeType();
                    if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible
                    if (TextParser.supportsMime(mimeType) != null) continue;
                    // read the content
                    byte[] b = this.reader.getArticleData(ae);
                    // create artificial request and response headers for the indexer
                    RequestHeader requestHeader = new RequestHeader();
                    ResponseHeader responseHeader = new ResponseHeader(200);
                    responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
                    responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessed date to have something that is not the current date
                    final Request request = new Request(
                            ASCII.getBytes(sb.peers.mySeed().hash),
                            guessedUrl,
                            null, // referrerhash the hash of the referrer URL
                            de.title, // name the name of the document to crawl
                            null, // appdate the time when the url first appeared
                            sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null!
                            0, // depth the crawling depth of the entry
                            sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
                    );
                    final Response response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
                            Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
                            false,
                            b
                    );
                    // throw this to the indexer
                    String error = sb.toIndexer(response);
                    if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
                    this.recordCnt++;
                } catch (Exception e) {
                    // catch any error that could stop the importer
                    ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage());
                }
            }
        } catch (IOException e) {
            ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
@@ -266,6 +266,8 @@ public class ZimImporter extends Thread implements Importer {
return parts[1] + ".vikidia.org/wiki"; return parts[1] + ".vikidia.org/wiki";
case "westeros": case "westeros":
return "westeros.org"; return "westeros.org";
case "mdwiki":
return "mdwiki.org/wiki";
case "wikihow": case "wikihow":
return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com"; return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
case "wikisource": case "wikisource":
