Merge branch 'master' of git@github.com:yacy/yacy_search_server.git

pull/122/head
Michael Peter Christen 8 years ago
commit 1d81b8f102

@ -99,5 +99,9 @@
<classpathentry kind="lib" path="lib/imageio-bmp-3.3.1.jar"/>
<classpathentry kind="lib" path="lib/jsonic-1.2.0.jar"/>
<classpathentry kind="lib" path="lib/langdetect.jar"/>
<classpathentry kind="lib" path="lib/jwat-common-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-gzip-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-archive-common-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-warc-1.0.4.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -217,6 +217,10 @@
<pathelement location="${lib}/jsch-0.1.54.jar" />
<pathelement location="${lib}/json-simple-1.1.1.jar" />
<pathelement location="${lib}/jsoup-1.10.2.jar" />
<pathelement location="${lib}/jwat-common-1.0.4.jar" />
<pathelement location="${lib}/jwat-gzip-1.0.4.jar" />
<pathelement location="${lib}/jwat-archive-common-1.0.4.jar" />
<pathelement location="${lib}/jwat-warc-1.0.4.jar" />
<pathelement location="${lib}/log4j-over-slf4j-1.7.24.jar" />
<pathelement location="${lib}/lucene-analyzers-common-5.5.3.jar" />
<pathelement location="${lib}/lucene-analyzers-phonetic-5.5.3.jar" />

@ -103,7 +103,7 @@ function updatepage(str) {
<fieldset><legend>URL Metadata</legend>
<dl>
<dt>URL:</dt><dd><a href="#[url]#">#[url]#</a></dd>
<dt>Hash:</dt><dd><a href="solr/select?defType=edismax&start=0&rows=3&core=collection1&wt=html&q=id:%22#[hash]#%22">#[hash]#</a> (click this for full metadata)</dd>
<dt>Hash:</dt><dd>#(inurldb)##[hash]#::<a href="solr/select?defType=edismax&start=0&rows=3&core=collection1&wt=html&q=id:%22#[hash]#%22">#[hash]#</a> (click this for full metadata)#(/inurldb)#</dd>
<dt>In Metadata:</dt><dd>#(inurldb)#no::yes#(/inurldb)#</dd>
<dt>In Cache:</dt><dd>#(incache)#no::yes#(/incache)#</dd>
<dt>First Seen:</dt><dd>#[firstSeen]#</dd>

@ -356,6 +356,7 @@ public class ViewFile {
prop.put("error", "0");
prop.put("error_url", url.toNormalform(true));
prop.put("error_hash", urlHash);
prop.put("error_inurldb_hash", urlHash);
prop.put("error_wordCount", wordCount);
prop.put("error_firstSeen", "");
long firstseen = sb.index.getFirstSeenTime(ASCII.getBytes(urlHash));

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -476,6 +476,26 @@
<version>1.10.2</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-common</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-gzip</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-warc</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-archive-common</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>com.youcruit.com.cybozu.labs</groupId>
<artifactId>langdetect</artifactId>

@ -33,6 +33,7 @@ import java.util.regex.Pattern;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.lod.vocabulary.DublinCore;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
@ -72,7 +73,7 @@ public class HTMLResponseWriter implements QueryResponseWriter {
/**
* Append YaCy JavaScript license information to writer
* @param writer must be non null
* @throws IOException when a write error occured
* @throws IOException when a write error occurred
*/
private void writeJSLicence(final Writer writer) throws IOException {
writer.write("<script>");
@ -151,7 +152,10 @@ public class HTMLResponseWriter implements QueryResponseWriter {
NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
paramsList.remove("wt");
String xmlquery = dqp.matcher("../solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
final String coreName = request.getCore().getName();
String xmlquery = dqp.matcher("../solr/select?" + SolrParams.toSolrParams(paramsList).toString() + "&core=" + coreName).replaceAll("%22");
DocList response = ((ResultContext) values.get("response")).docs;
final int sz = response.size();
@ -164,24 +168,39 @@ public class HTMLResponseWriter implements QueryResponseWriter {
Document doc = searcher.doc(id);
LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
String title = doc.get(CollectionSchema.title.getSolrFieldName()); // title is multivalued, after translation fieldname could be in tdoc. "title_0" ..., so get it from doc
if (title == null) title = "";
if (sz == 1) {
writer.write("<title>" + title + "</title>\n</head><body>\n");
String title;
if(CollectionSchema.CORE_NAME.equals(coreName)) {
title = doc.get(CollectionSchema.title.getSolrFieldName()); // title is multivalued, after translation fieldname could be in tdoc. "title_0" ..., so get it from doc
if (title == null) title = "";
if (sz == 1) {
writer.write("<title>" + title + "</title>\n</head><body>\n");
} else {
writer.write("<title>Documents List</title>\n</head><body>\n");
}
} else if(WebgraphSchema.CORE_NAME.equals(coreName)) {
title = "";
writer.write("<title>Links list</title>\n</head><body>\n");
} else {
writer.write("<title>Document List</title>\n</head><body>\n");
title = "";
writer.write("<title>Solr documents List</title>\n</head><body>\n");
}
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
writeDoc(writer, tdoc, title);
writeDoc(writer, tdoc, title, coreName);
while (iterator.hasNext()) {
id = iterator.nextDoc();
doc = searcher.doc(id);
tdoc = translateDoc(schema, doc);
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
writeDoc(writer, tdoc, title);
if(CollectionSchema.CORE_NAME.equals(coreName)) {
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
if (title == null) title = "";
} else {
title = "";
}
writeDoc(writer, tdoc, title, coreName);
}
} else {
writer.write("<title>No Document Found</title>\n</head><body>\n");
@ -191,16 +210,21 @@ public class HTMLResponseWriter implements QueryResponseWriter {
writer.write("</body></html>\n");
}
private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
private static final void writeDoc(final Writer writer, final LinkedHashMap<String, String> tdoc, final String title, final String coreName) throws IOException {
writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
writer.write("<fieldset>\n");
// add a link to re-crawl this url (in case it is a remote metadata only entry)
String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
final String jsc= "javascript:w = window.open('../QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&followFrames=on&obeyHtmlRobotsNoindex=on&obeyHtmlRobotsNofollow=off&xdstopw=on&title=" + URLEncoder.encode(title, StandardCharsets.UTF_8.name()) + "&url='+escape('"+sku+"'),'_blank','height=250,width=600,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();";
writer.write("<div class='btn btn-default btn-sm' style='float:right' onclick=\""+jsc+"\">re-crawl url</div>\n");
if(CollectionSchema.CORE_NAME.equals(coreName)) {
String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
if(sku != null) {
final String jsc= "javascript:w = window.open('../QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&followFrames=on&obeyHtmlRobotsNoindex=on&obeyHtmlRobotsNofollow=off&xdstopw=on&title=" + URLEncoder.encode(title, StandardCharsets.UTF_8.name()) + "&url='+escape('"+sku+"'),'_blank','height=250,width=600,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();";
writer.write("<div class='btn btn-default btn-sm' style='float:right' onclick=\""+jsc+"\">re-crawl url</div>\n");
}
writer.write("<h1 property=\"" + DublinCore.Title.getURIref()+ "\">" + title + "</h1>\n");
}
writer.write("<h1 property=\"" + DublinCore.Title.getURIref()+ "\">" + title + "</h1>\n");
writer.write("<dl>\n");
for (Map.Entry<String, String> entry: tdoc.entrySet()) {
writer.write("<dt>");

@ -0,0 +1,138 @@
/**
* WarcImporter.java
* (C) 2017 by reger24; https://github.com/reger24
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
import net.yacy.server.http.ChunkedInputStream;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;
/**
* Web Archive file format reader to process the warc archive content (responses)
*
* Warc format specification ISO 28500
* https://archive.org/details/WARCISO28500Version1Latestdraft
* http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
*
* http://archive-access.sourceforge.net/warc/warc_file_format-0.9.html
* http://archive-access.sourceforge.net/warc/
*/
public class WarcImporter {

    /**
     * Reads a Warc file and adds all contained response records to the index.
     * The reader automatically handles plain or gzip'd warc files.
     *
     * Only records of type "response" with an HTTP status of 200 and a content
     * type supported by an available parser are forwarded to the indexer.
     *
     * @param f inputstream for the warc file (plain or gzip compressed)
     * @throws IOException when the warc stream cannot be opened or read
     */
    public void indexWarcRecords(InputStream f) throws IOException {
        byte[] content;
        int cnt = 0; // number of records handed to the indexer
        final WarcReader localwarcReader = WarcReaderFactory.getReader(f);
        try {
            WarcRecord wrec = localwarcReader.getNextRecord();
            while (wrec != null) {
                HeaderLine hl = wrec.getHeader(WarcConstants.FN_WARC_TYPE);
                if (hl != null && hl.value.equals(WarcConstants.RT_RESPONSE)) { // filter responses
                    hl = wrec.getHeader(WarcConstants.FN_WARC_TARGET_URI);
                    if (hl != null) { // guard: a malformed record may lack the target URI
                        DigestURL location = new DigestURL(hl.value);
                        HttpHeader http = wrec.getHttpHeader();
                        if (http != null && http.statusCode == 200) { // process http response header OK (status 200)
                            if (TextParser.supportsMime(http.contentType) == null) { // check availability of parser
                                InputStream istream = wrec.getPayloadContent();
                                try {
                                    hl = http.getHeader(HeaderFramework.TRANSFER_ENCODING);
                                    if (hl != null && hl.value.contains("chunked")) {
                                        // because chunked stream.read doesn't read source fully, make sure all chunks are read
                                        istream = new ChunkedInputStream(istream);
                                        final ByteBuffer bbuffer = new ByteBuffer();
                                        int c;
                                        while ((c = istream.read()) >= 0) {
                                            bbuffer.append(c);
                                        }
                                        content = bbuffer.getBytes();
                                    } else {
                                        // read the payload completely: a single InputStream.read call
                                        // may return fewer bytes than requested, so loop until full
                                        content = new byte[(int) http.getPayloadLength()];
                                        int off = 0;
                                        while (off < content.length) {
                                            final int r = istream.read(content, off, content.length - off);
                                            if (r < 0) break; // truncated record: keep what was read
                                            off += r;
                                        }
                                    }
                                } finally {
                                    istream.close(); // release payload stream even if reading failed
                                }
                                RequestHeader requestHeader = new RequestHeader();
                                ResponseHeader responseHeader = new ResponseHeader(http.statusCode);
                                for (HeaderLine hx : http.getHeaderList()) { // include all original response headers for parser
                                    responseHeader.put(hx.name, hx.value);
                                }
                                final Request request = new Request(
                                        null,
                                        location,
                                        requestHeader.referer() == null ? null : requestHeader.referer().hash(),
                                        "warc",
                                        responseHeader.lastModified(),
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile.handle(), // use remote profile (to index text & media, without writing to cache)
                                        0,
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile.timezoneOffset());
                                final Response response = new Response(
                                        request,
                                        requestHeader,
                                        responseHeader,
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile,
                                        false,
                                        content
                                );
                                Switchboard.getSwitchboard().toIndexer(response);
                                cnt++;
                            }
                        }
                    }
                }
                wrec = localwarcReader.getNextRecord();
            }
        } finally {
            // always release the warc reader, even if a record failed to parse
            localwarcReader.close();
        }
        ConcurrentLog.info("WarcImporter", "Indexed " + cnt + " documents");
    }
}

@ -164,6 +164,7 @@ import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation;
@ -2002,6 +2003,16 @@ public final class Switchboard extends serverSwitch {
if (zis != null) try {zis.close();} catch (final IOException e) {}
}
return moved;
} else if (s.endsWith(".warc") || s.endsWith(".warc.gz")) {
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
WarcImporter wri = new WarcImporter();
wri.indexWarcRecords(is);
moved = infile.renameTo(outfile);
} catch (IOException ex) {
log.warn("IO Error processing warc file " + infile);
}
return moved;
}
InputStream is = null;
try {
@ -2162,7 +2173,9 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip") ) {
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz") ) {
// read the surrogate file and store entry in index
if ( processSurrogate(surrogate) ) {
return true;

Loading…
Cancel
Save