Merge branch 'master' of git@github.com:yacy/yacy_search_server.git

pull/122/head
Michael Peter Christen 8 years ago
commit 1d81b8f102

@ -99,5 +99,9 @@
<classpathentry kind="lib" path="lib/imageio-bmp-3.3.1.jar"/>
<classpathentry kind="lib" path="lib/jsonic-1.2.0.jar"/>
<classpathentry kind="lib" path="lib/langdetect.jar"/>
<classpathentry kind="lib" path="lib/jwat-common-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-gzip-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-archive-common-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/jwat-warc-1.0.4.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -217,6 +217,10 @@
<pathelement location="${lib}/jsch-0.1.54.jar" />
<pathelement location="${lib}/json-simple-1.1.1.jar" />
<pathelement location="${lib}/jsoup-1.10.2.jar" />
<pathelement location="${lib}/jwat-common-1.0.4.jar" />
<pathelement location="${lib}/jwat-gzip-1.0.4.jar" />
<pathelement location="${lib}/jwat-archive-common-1.0.4.jar" />
<pathelement location="${lib}/jwat-warc-1.0.4.jar" />
<pathelement location="${lib}/log4j-over-slf4j-1.7.24.jar" />
<pathelement location="${lib}/lucene-analyzers-common-5.5.3.jar" />
<pathelement location="${lib}/lucene-analyzers-phonetic-5.5.3.jar" />

@ -103,7 +103,7 @@ function updatepage(str) {
<fieldset><legend>URL Metadata</legend>
<dl>
<dt>URL:</dt><dd><a href="#[url]#">#[url]#</a></dd>
<dt>Hash:</dt><dd><a href="solr/select?defType=edismax&start=0&rows=3&core=collection1&wt=html&q=id:%22#[hash]#%22">#[hash]#</a> (click this for full metadata)</dd>
<dt>Hash:</dt><dd>#(inurldb)##[hash]#::<a href="solr/select?defType=edismax&start=0&rows=3&core=collection1&wt=html&q=id:%22#[hash]#%22">#[hash]#</a> (click this for full metadata)#(/inurldb)#</dd>
<dt>In Metadata:</dt><dd>#(inurldb)#no::yes#(/inurldb)#</dd>
<dt>In Cache:</dt><dd>#(incache)#no::yes#(/incache)#</dd>
<dt>First Seen:</dt><dd>#[firstSeen]#</dd>

@ -356,6 +356,7 @@ public class ViewFile {
prop.put("error", "0");
prop.put("error_url", url.toNormalform(true));
prop.put("error_hash", urlHash);
prop.put("error_inurldb_hash", urlHash);
prop.put("error_wordCount", wordCount);
prop.put("error_firstSeen", "");
long firstseen = sb.index.getFirstSeenTime(ASCII.getBytes(urlHash));

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -476,6 +476,26 @@
<version>1.10.2</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-common</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-gzip</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-warc</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.jwat</groupId>
<artifactId>jwat-archive-common</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>com.youcruit.com.cybozu.labs</groupId>
<artifactId>langdetect</artifactId>

@ -33,6 +33,7 @@ import java.util.regex.Pattern;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.lod.vocabulary.DublinCore;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
@ -72,7 +73,7 @@ public class HTMLResponseWriter implements QueryResponseWriter {
/**
* Append YaCy JavaScript license information to writer
* @param writer must be non null
* @throws IOException when a write error occured
* @throws IOException when a write error occurred
*/
private void writeJSLicence(final Writer writer) throws IOException {
writer.write("<script>");
@ -151,7 +152,10 @@ public class HTMLResponseWriter implements QueryResponseWriter {
NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
paramsList.remove("wt");
String xmlquery = dqp.matcher("../solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
final String coreName = request.getCore().getName();
String xmlquery = dqp.matcher("../solr/select?" + SolrParams.toSolrParams(paramsList).toString() + "&core=" + coreName).replaceAll("%22");
DocList response = ((ResultContext) values.get("response")).docs;
final int sz = response.size();
@ -164,24 +168,39 @@ public class HTMLResponseWriter implements QueryResponseWriter {
Document doc = searcher.doc(id);
LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
String title = doc.get(CollectionSchema.title.getSolrFieldName()); // title is multivalued, after translation fieldname could be in tdoc. "title_0" ..., so get it from doc
if (title == null) title = "";
if (sz == 1) {
writer.write("<title>" + title + "</title>\n</head><body>\n");
String title;
if(CollectionSchema.CORE_NAME.equals(coreName)) {
title = doc.get(CollectionSchema.title.getSolrFieldName()); // title is multivalued, after translation fieldname could be in tdoc. "title_0" ..., so get it from doc
if (title == null) title = "";
if (sz == 1) {
writer.write("<title>" + title + "</title>\n</head><body>\n");
} else {
writer.write("<title>Documents List</title>\n</head><body>\n");
}
} else if(WebgraphSchema.CORE_NAME.equals(coreName)) {
title = "";
writer.write("<title>Links list</title>\n</head><body>\n");
} else {
writer.write("<title>Document List</title>\n</head><body>\n");
title = "";
writer.write("<title>Solr documents List</title>\n</head><body>\n");
}
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
writeDoc(writer, tdoc, title);
writeDoc(writer, tdoc, title, coreName);
while (iterator.hasNext()) {
id = iterator.nextDoc();
doc = searcher.doc(id);
tdoc = translateDoc(schema, doc);
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
writeDoc(writer, tdoc, title);
if(CollectionSchema.CORE_NAME.equals(coreName)) {
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
if (title == null) title = "";
} else {
title = "";
}
writeDoc(writer, tdoc, title, coreName);
}
} else {
writer.write("<title>No Document Found</title>\n</head><body>\n");
@ -191,16 +210,21 @@ public class HTMLResponseWriter implements QueryResponseWriter {
writer.write("</body></html>\n");
}
private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
private static final void writeDoc(final Writer writer, final LinkedHashMap<String, String> tdoc, final String title, final String coreName) throws IOException {
writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
writer.write("<fieldset>\n");
// add a link to re-crawl this url (in case it is a remote metadata only entry)
String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
final String jsc= "javascript:w = window.open('../QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&followFrames=on&obeyHtmlRobotsNoindex=on&obeyHtmlRobotsNofollow=off&xdstopw=on&title=" + URLEncoder.encode(title, StandardCharsets.UTF_8.name()) + "&url='+escape('"+sku+"'),'_blank','height=250,width=600,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();";
writer.write("<div class='btn btn-default btn-sm' style='float:right' onclick=\""+jsc+"\">re-crawl url</div>\n");
if(CollectionSchema.CORE_NAME.equals(coreName)) {
String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
if(sku != null) {
final String jsc= "javascript:w = window.open('../QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&followFrames=on&obeyHtmlRobotsNoindex=on&obeyHtmlRobotsNofollow=off&xdstopw=on&title=" + URLEncoder.encode(title, StandardCharsets.UTF_8.name()) + "&url='+escape('"+sku+"'),'_blank','height=250,width=600,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();";
writer.write("<div class='btn btn-default btn-sm' style='float:right' onclick=\""+jsc+"\">re-crawl url</div>\n");
}
writer.write("<h1 property=\"" + DublinCore.Title.getURIref()+ "\">" + title + "</h1>\n");
}
writer.write("<h1 property=\"" + DublinCore.Title.getURIref()+ "\">" + title + "</h1>\n");
writer.write("<dl>\n");
for (Map.Entry<String, String> entry: tdoc.entrySet()) {
writer.write("<dt>");

@ -0,0 +1,138 @@
/**
* WarcImporter.java
* (C) 2017 by reger24; https://github.com/reger24
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
import net.yacy.server.http.ChunkedInputStream;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;
/**
* Web Archive file format reader to process the warc archive content (responses)
*
* Warc format specification ISO 28500
* https://archive.org/details/WARCISO28500Version1Latestdraft
* http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
*
* http://archive-access.sourceforge.net/warc/warc_file_format-0.9.html
* http://archive-access.sourceforge.net/warc/
*/
public class WarcImporter {

    /**
     * Reads a Warc file and adds all contained response records to the index.
     * The reader automatically handles plain or gzip'd warc files.
     *
     * Only records of type "response" with an HTTP status of 200 and a content
     * type supported by an available parser are forwarded to the indexer.
     *
     * @param f inputstream for the warc file (plain or gzip compressed)
     * @throws IOException when the warc stream cannot be opened or read
     */
    public void indexWarcRecords(InputStream f) throws IOException {
        byte[] content;
        int cnt = 0; // number of records handed to the indexer
        final WarcReader localwarcReader = WarcReaderFactory.getReader(f);
        try {
            WarcRecord wrec = localwarcReader.getNextRecord();
            while (wrec != null) {
                HeaderLine hl = wrec.getHeader(WarcConstants.FN_WARC_TYPE);
                if (hl != null && hl.value.equals(WarcConstants.RT_RESPONSE)) { // filter responses
                    hl = wrec.getHeader(WarcConstants.FN_WARC_TARGET_URI);
                    if (hl != null) { // guard: a malformed record may lack the target URI
                        DigestURL location = new DigestURL(hl.value);
                        HttpHeader http = wrec.getHttpHeader();
                        if (http != null && http.statusCode == 200) { // process http response header OK (status 200)
                            if (TextParser.supportsMime(http.contentType) == null) { // check availability of parser
                                InputStream istream = wrec.getPayloadContent();
                                try {
                                    hl = http.getHeader(HeaderFramework.TRANSFER_ENCODING);
                                    if (hl != null && hl.value.contains("chunked")) {
                                        // because chunked stream.read doesn't read source fully, make sure all chunks are read
                                        istream = new ChunkedInputStream(istream);
                                        final ByteBuffer bbuffer = new ByteBuffer();
                                        int c;
                                        while ((c = istream.read()) >= 0) {
                                            bbuffer.append(c);
                                        }
                                        content = bbuffer.getBytes();
                                    } else {
                                        // read the payload completely: a single InputStream.read call
                                        // may return fewer bytes than requested, so loop until full
                                        content = new byte[(int) http.getPayloadLength()];
                                        int off = 0;
                                        while (off < content.length) {
                                            final int r = istream.read(content, off, content.length - off);
                                            if (r < 0) break; // truncated record: keep what was read
                                            off += r;
                                        }
                                    }
                                } finally {
                                    istream.close(); // release payload stream even if reading failed
                                }
                                RequestHeader requestHeader = new RequestHeader();
                                ResponseHeader responseHeader = new ResponseHeader(http.statusCode);
                                for (HeaderLine hx : http.getHeaderList()) { // include all original response headers for parser
                                    responseHeader.put(hx.name, hx.value);
                                }
                                final Request request = new Request(
                                        null,
                                        location,
                                        requestHeader.referer() == null ? null : requestHeader.referer().hash(),
                                        "warc",
                                        responseHeader.lastModified(),
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile.handle(), // use remote profile (to index text & media, without writing to cache)
                                        0,
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile.timezoneOffset());
                                final Response response = new Response(
                                        request,
                                        requestHeader,
                                        responseHeader,
                                        Switchboard.getSwitchboard().crawler.defaultRemoteProfile,
                                        false,
                                        content
                                );
                                Switchboard.getSwitchboard().toIndexer(response);
                                cnt++;
                            }
                        }
                    }
                }
                wrec = localwarcReader.getNextRecord();
            }
        } finally {
            // always release the warc reader, even if a record failed to parse
            localwarcReader.close();
        }
        ConcurrentLog.info("WarcImporter", "Indexed " + cnt + " documents");
    }
}

@ -164,6 +164,7 @@ import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation;
@ -2002,6 +2003,16 @@ public final class Switchboard extends serverSwitch {
if (zis != null) try {zis.close();} catch (final IOException e) {}
}
return moved;
} else if (s.endsWith(".warc") || s.endsWith(".warc.gz")) {
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
WarcImporter wri = new WarcImporter();
wri.indexWarcRecords(is);
moved = infile.renameTo(outfile);
} catch (IOException ex) {
log.warn("IO Error processing warc file " + infile);
}
return moved;
}
InputStream is = null;
try {
@ -2162,7 +2173,9 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip") ) {
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz") ) {
// read the surrogate file and store entry in index
if ( processSurrogate(surrogate) ) {
return true;

Loading…
Cancel
Save