Implement surrogate import from WARC archives (handled as a first option).

warc = Web ARChive File Format.
WARC files with the extension .warc, or compressed as warc.gz, can be placed in the
DATA/surrogate/in directory, and the contained responses are imported into the index.
The library used is stream-based, so we can easily extend it later to
load WARCs from the net.
pull/122/head
reger 8 years ago
parent 5b5b9d5d96
commit 510f11d374

@ -99,5 +99,9 @@
<classpathentry kind="lib" path="lib/imageio-bmp-3.3.1.jar"/>
<classpathentry kind="lib" path="lib/jsonic-1.2.0.jar"/>
<classpathentry kind="lib" path="lib/langdetect.jar"/>
<classpathentry kind="lib" path="lib/jwat-common-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-gzip-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-archive-common-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-warc-1.0.4.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -217,6 +217,10 @@
<pathelement location="${lib}/jsch-0.1.54.jar" />
<pathelement location="${lib}/json-simple-1.1.1.jar" />
<pathelement location="${lib}/jsoup-1.10.2.jar" />
<pathelement location="${lib}/jwat-common-1.0.4.jar" />
<pathelement location="${lib}/jwat-gzip-1.0.4.jar" />
<pathelement location="${lib}/jwat-archive-common-1.0.4.jar" />
<pathelement location="${lib}/jwat-warc-1.0.4.jar" />
<pathelement location="${lib}/log4j-over-slf4j-1.7.24.jar" />
<pathelement location="${lib}/lucene-analyzers-common-5.5.3.jar" />
<pathelement location="${lib}/lucene-analyzers-phonetic-5.5.3.jar" />

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -476,6 +476,26 @@
<version>1.10.2</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-common</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-gzip</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-warc</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-archive-common</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>com.youcruit.com.cybozu.labs</groupId>
<artifactId>langdetect</artifactId>

@ -0,0 +1,138 @@
/**
* WarcImporter.java
* (C) 2017 by reger24; https://github.com/reger24
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
import net.yacy.server.http.ChunkedInputStream;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;
/**
* Web Archive file format reader to process the warc archive content (responses)
*
* Warc format specification ISO 28500
* https://archive.org/details/WARCISO28500Version1Latestdraft
* http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
*
* http://archive-access.sourceforge.net/warc/warc_file_format-0.9.html
* http://archive-access.sourceforge.net/warc/
*/
public class WarcImporter {

    /**
     * Reads a WARC file and adds all contained HTTP responses to the index.
     * The reader automatically handles plain or gzip'd WARC input.
     *
     * Only records of type "response" with HTTP status 200 and a content type
     * for which a parser is available are indexed; all other records are skipped.
     *
     * @param f input stream for the WARC file (plain or gzip compressed)
     * @throws IOException if reading the archive fails
     */
    public void indexWarcRecords(InputStream f) throws IOException {
        int cnt = 0; // number of successfully indexed response records
        final WarcReader localwarcReader = WarcReaderFactory.getReader(f);
        try {
            WarcRecord wrec = localwarcReader.getNextRecord();
            while (wrec != null) {
                HeaderLine hl = wrec.getHeader(WarcConstants.FN_WARC_TYPE);
                if (hl != null && hl.value.equals(WarcConstants.RT_RESPONSE)) { // filter responses
                    hl = wrec.getHeader(WarcConstants.FN_WARC_TARGET_URI);
                    if (hl != null && hl.value != null) { // guard: a record without a target URI cannot be indexed
                        final DigestURL location = new DigestURL(hl.value);
                        final HttpHeader http = wrec.getHttpHeader();
                        if (http != null && http.statusCode == 200) { // process http response header OK (status 200)
                            if (TextParser.supportsMime(http.contentType) == null) { // null return = parser available
                                byte[] content;
                                InputStream istream = wrec.getPayloadContent();
                                try {
                                    hl = http.getHeader(HeaderFramework.TRANSFER_ENCODING);
                                    if (hl != null && hl.value.contains("chunked")) {
                                        // chunked streams must be decoded chunk by chunk until exhausted
                                        istream = new ChunkedInputStream(istream);
                                        final ByteBuffer bbuffer = new ByteBuffer();
                                        int c;
                                        while ((c = istream.read()) >= 0) {
                                            bbuffer.append(c);
                                        }
                                        content = bbuffer.getBytes();
                                    } else {
                                        // InputStream.read may return fewer bytes than requested:
                                        // loop until the declared payload length is fully read or EOF
                                        content = new byte[(int) http.getPayloadLength()];
                                        int pos = 0;
                                        while (pos < content.length) {
                                            final int n = istream.read(content, pos, content.length - pos);
                                            if (n < 0) break; // truncated payload, keep what we have
                                            pos += n;
                                        }
                                    }
                                } finally {
                                    istream.close();
                                }
                                final RequestHeader requestHeader = new RequestHeader();
                                final ResponseHeader responseHeader = new ResponseHeader(http.statusCode);
                                for (final HeaderLine hx : http.getHeaderList()) { // include all original response headers for parser
                                    responseHeader.put(hx.name, hx.value);
                                }
                                final Request request = new Request(
                                        null,
                                        location,
                                        requestHeader.referer() == null ? null : requestHeader.referer().hash(),
                                        "warc",
                                        responseHeader.lastModified(),
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile.handle(), // use remote profile (to index text & media, without writing to cache)
                                        0,
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile.timezoneOffset());
                                final Response response = new Response(
                                        request,
                                        requestHeader,
                                        responseHeader,
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile,
                                        false,
                                        content
                                );
                                Switchboard.getSwitchboard().toIndexer(response);
                                cnt++;
                            }
                        }
                    }
                }
                wrec = localwarcReader.getNextRecord();
            }
        } finally {
            // always release the reader, even if a record throws mid-archive
            localwarcReader.close();
        }
        ConcurrentLog.info("WarcImporter", "Indexed " + cnt + " documents");
    }
}

@ -164,6 +164,7 @@ import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation;
@ -2002,6 +2003,16 @@ public final class Switchboard extends serverSwitch {
if (zis != null) try {zis.close();} catch (final IOException e) {}
}
return moved;
} else if (s.endsWith(".warc") || s.endsWith(".warc.gz")) {
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
WarcImporter wri = new WarcImporter();
wri.indexWarcRecords(is);
moved = infile.renameTo(outfile);
} catch (IOException ex) {
log.warn("IO Error processing warc file " + infile);
}
return moved;
}
InputStream is = null;
try {
@ -2162,7 +2173,9 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip") ) {
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz") ) {
// read the surrogate file and store entry in index
if ( processSurrogate(surrogate) ) {
return true;

Loading…
Cancel
Save