added zim importer rule for mdwiki

pull/621/head
Michael Peter Christen 1 year ago
parent 4a611ac6a3
commit 3d3bdb0f5f

@@ -108,63 +108,63 @@ public class ZimImporter extends Thread implements Importer {
// read all documents
for (int i = 0; i < this.file.header_entryCount; i++) {
try {
if (this.abort) break;
DirectoryEntry de = this.reader.getDirectoryInfo(i);
if (!(de instanceof ZIMReader.ArticleEntry)) continue;
ArticleEntry ae = (ArticleEntry) de;
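// accept only article content: namespace 'A' (articles in older ZIM files) and 'C' (content namespace in newer ZIM files)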
if (ae.namespace != 'C' && ae.namespace != 'A') continue;
// check url
DigestURL guessedUrl = guessURL(this.guessedSource, de);
if (recordCnt < 10) {
// critical test for the first 10 urls: if a guessed url does not exist, the url guessing is likely wrong for this file and the import is aborted
if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
return;
}
}
// check availability of text parser
String mimeType = ae.getMimeType();
if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible
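// TextParser.supportsMime returns null when a parser exists and an error message otherwise, so non-null means unsupported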
if (TextParser.supportsMime(mimeType) != null) continue;
// read the content
byte[] b = this.reader.getArticleData(ae);
// create artificial request and response headers for the indexer
RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessed date to have something that is not the current date
final Request request = new Request(
ASCII.getBytes(sb.peers.mySeed().hash),
guessedUrl,
null, // referrerhash the hash of the referrer URL
de.title, // name the name of the document to crawl
null, // appdate the time when the url first appeared
sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null!
0, // depth the crawling depth of the entry
sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
);
final Response response = new Response(
request,
requestHeader,
responseHeader,
Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
false,
b
);
// throw this to the indexer
String error = sb.toIndexer(response);
if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
this.recordCnt++;
} catch (Exception e) {
// catch any error that could stop the importer
ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage());
}
}
} catch (IOException e) {
ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
@@ -266,6 +266,8 @@ public class ZimImporter extends Thread implements Importer {
return parts[1] + ".vikidia.org/wiki";
case "westeros":
return "westeros.org";
case "mdwiki":
return "mdwiki.org/wiki";
case "wikihow":
return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
case "wikisource":

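The hunk above adds mdwiki to the mapping from ZIM file names to source hosts. Below is a minimal sketch of how that switch resolves a file name, assuming the common project_language_selection_date.zim naming convention; the method name guessDomainName, the filename parsing, and the default branch are illustrative assumptions:

public static String guessDomainName(String zimFileName) {
    // e.g. "mdwiki_en_all_maxi_2023-07.zim" -> parts = ["mdwiki", "en", "all", ...] (example name assumed)
    String[] parts = zimFileName.split("_");
    switch (parts[0]) {
        case "mdwiki":
            return "mdwiki.org/wiki"; // rule added by this commit: one host regardless of language
        case "wikihow":
            return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
        default:
            return null; // unknown project: no guess possible (placeholder, an assumption)
    }
}

With this rule, a file such as mdwiki_en_all_maxi_2023-07.zim resolves to mdwiki.org/wiki, from which the importer can build per-article URLs.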