Added an export option that exports only the fulltext of the search index

pull/9/merge
Michael Peter Christen 10 years ago
parent fbeae20b3a
commit de8cfbe1d7

@ -39,19 +39,21 @@
<dt>Only Domain:</dt> <dt>Only Domain:</dt>
<dd><input type="radio" name="format" value="dom-text" /> Plain Text List (domains only)<br /> <dd><input type="radio" name="format" value="dom-text" /> Plain Text List (domains only)<br />
<input type="radio" name="format" value="dom-html" /> HTML (domains as URLs, no title)</dd> <input type="radio" name="format" value="dom-html" /> HTML (domains as URLs, no title)</dd>
<dt>Only Text:</dt>
<dd><input type="radio" name="format" value="text-text" /> Fulltext of Search Index Text</dd>
</dl> </dl>
</dd> </dd>
<dt>&nbsp;</dt> <dt>&nbsp;</dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" class="btn btn-primary" style="width:240px;"/> <dd><input type="submit" name="lurlexport" value="Export" class="btn btn-primary" style="width:240px;"/>
</dd> </dd>
</dl> </dl>
</fieldset> </fieldset>
</form>:: </form>::
<div class="alert alert-info" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# URLs so far</div>:: <div class="alert alert-info" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# Documents so far</div>::
#(/lurlexport)# #(/lurlexport)#
#(lurlexportfinished)#:: #(lurlexportfinished)#::
<div class="alert alert-success">Finished export of #[urlcount]# URLs to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a><br/> <div class="alert alert-success">Finished export of #[urlcount]# Documents to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a><br/>
<em>Import this file by moving it to DATA/SURROGATES/in</em></div>:: <em>Import this file by moving it to DATA/SURROGATES/in</em></div>::
#(/lurlexportfinished)# #(/lurlexportfinished)#

@ -93,27 +93,28 @@ public class IndexExport_p {
if (post.containsKey("lurlexport")) { if (post.containsKey("lurlexport")) {
// parse format // parse format
int format = 0; Fulltext.ExportFormat format = Fulltext.ExportFormat.text;
final String fname = post.get("format", "url-text"); final String fname = post.get("format", "url-text");
final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain
if (fname.endsWith("text")) format = 0; final boolean text = fname.startsWith("text");
if (fname.endsWith("html")) format = 1; if (fname.endsWith("text")) format = Fulltext.ExportFormat.text;
if (fname.endsWith("rss")) format = 2; if (fname.endsWith("html")) format = Fulltext.ExportFormat.html;
if (fname.endsWith("solr")) format = 3; if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss;
if (fname.endsWith("solr")) format = Fulltext.ExportFormat.solr;
// extend export file name // extend export file name
String s = post.get("exportfile", ""); String s = post.get("exportfile", "");
if (s.indexOf('.',0) < 0) { if (s.indexOf('.',0) < 0) {
if (format == 0) s = s + ".txt"; if (format == Fulltext.ExportFormat.text) s = s + ".txt";
if (format == 1) s = s + ".html"; if (format == Fulltext.ExportFormat.html) s = s + ".html";
if (format == 2 ) s = s + "_rss.xml"; if (format == Fulltext.ExportFormat.rss ) s = s + "_rss.xml";
if (format == 3) s = s + "_full.xml"; if (format == Fulltext.ExportFormat.solr) s = s + "_full.xml";
} }
final File f = new File(s); final File f = new File(s);
f.getParentFile().mkdirs(); f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*"); final String filter = post.get("exportfilter", ".*");
final String query = post.get("exportquery", "*:*"); final String query = post.get("exportquery", "*:*");
final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom); final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom, text);
prop.put("lurlexport_exportfile", s); prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count()); prop.put("lurlexport_urlcount", running.count());

@ -843,14 +843,12 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
final int q = fileName.lastIndexOf('?'); final int q = fileName.lastIndexOf('?');
if (q < 0) { if (q < 0) {
return fileName.substring(p + 1).toLowerCase(); return fileName.substring(p + 1).toLowerCase();
} else {
// check last dot in query part
if (p > q) {
return ""; // TODO: last . after ? (file.ext?param=one.txt)
} else {
return fileName.substring(p + 1, q).toLowerCase();
}
} }
// check last dot in query part
if (p > q) {
return ""; // TODO: last . after ? (file.ext?param=one.txt)
}
return fileName.substring(p + 1, q).toLowerCase();
} }
public String getPath() { public String getPath() {

@ -618,13 +618,17 @@ public final class Fulltext {
} }
} }
public static enum ExportFormat {
text, html, rss, solr;
}
// export methods // export methods
public Export export(final File f, final String filter, final String query, final int format, final boolean dom) { public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) { if ((this.exportthread != null) && (this.exportthread.isAlive())) {
ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread; return this.exportthread;
} }
this.exportthread = new Export(f, filter, query, format, dom); this.exportthread = new Export(f, filter, query, format, dom, text);
this.exportthread.start(); this.exportthread.start();
return this.exportthread; return this.exportthread;
} }
@ -638,10 +642,10 @@ public final class Fulltext {
private final Pattern pattern; private final Pattern pattern;
private int count; private int count;
private String failure, query; private String failure, query;
private final int format; private final ExportFormat format;
private final boolean dom; private final boolean dom, text;
private Export(final File f, final String filter, final String query, final int format, boolean dom) { private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) {
// format: 0=text, 1=html, 2=rss/xml // format: 0=text, 1=html, 2=rss/xml
this.f = f; this.f = f;
this.pattern = filter == null ? null : Pattern.compile(filter); this.pattern = filter == null ? null : Pattern.compile(filter);
@ -650,6 +654,7 @@ public final class Fulltext {
this.failure = null; this.failure = null;
this.format = format; this.format = format;
this.dom = dom; this.dom = dom;
this.text = text;
//if ((dom) && (format == 2)) dom = false; //if ((dom) && (format == 2)) dom = false;
} }
@ -658,13 +663,13 @@ public final class Fulltext {
try { try {
final File parentf = this.f.getParentFile(); final File parentf = this.f.getParentFile();
if (parentf != null) parentf.mkdirs(); if (parentf != null) parentf.mkdirs();
OutputStream os = new FileOutputStream(this.format == 3 ? new File(this.f.getAbsolutePath() + ".gz") : this.f); OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f);
if (this.format == 3) os = new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}}; if (this.format == ExportFormat.solr) os = new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}};
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(os)); final PrintWriter pw = new PrintWriter(new BufferedOutputStream(os));
if (this.format == 1) { if (this.format == ExportFormat.html) {
pw.println("<html><head></head><body>"); pw.println("<html><head></head><body>");
} }
if (this.format == 2) { if (this.format == ExportFormat.rss) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>"); pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">"); pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
@ -673,7 +678,7 @@ public final class Fulltext {
pw.println("<description></description>"); pw.println("<description></description>");
pw.println("<link>http://yacy.net</link>"); pw.println("<link>http://yacy.net</link>");
} }
if (this.format == 3) { if (this.format == ExportFormat.solr) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<response>"); pw.println("<response>");
pw.println("<result>"); pw.println("<result>");
@ -683,12 +688,25 @@ public final class Fulltext {
ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
for (final String host: stats) { for (final String host: stats) {
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
if (this.format == 0) pw.println(host); if (this.format == ExportFormat.text) pw.println(host);
if (this.format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>"); if (this.format == ExportFormat.html) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
this.count++; this.count++;
} }
} else { } else {
if (this.format < 3) { if (this.format == ExportFormat.solr || (this.text && this.format == ExportFormat.text)) {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true);
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
CRIgnoreWriter sw = new CRIgnoreWriter();
if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); else EnhancedXMLResponseWriter.writeDoc(sw, doc);
sw.close();
String d = sw.toString();
pw.println(d);
this.count++;
}
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
@ -705,13 +723,13 @@ public final class Fulltext {
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.format == 0) { if (this.format == ExportFormat.text) {
pw.println(url); pw.println(url);
} }
if (this.format == 1) { if (this.format == ExportFormat.html) {
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>"); if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
} }
if (this.format == 2) { if (this.format == ExportFormat.rss) {
pw.println("<item>"); pw.println("<item>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>"); if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>"); pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
@ -724,29 +742,16 @@ public final class Fulltext {
} }
this.count++; this.count++;
} }
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true);
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
CRIgnoreWriter sw = new CRIgnoreWriter();
EnhancedXMLResponseWriter.writeDoc(sw, doc);
sw.close();
String d = sw.toString();
pw.println(d);
this.count++;
}
} }
} }
if (this.format == 1) { if (this.format == ExportFormat.html) {
pw.println("</body></html>"); pw.println("</body></html>");
} }
if (this.format == 2) { if (this.format == ExportFormat.rss) {
pw.println("</channel>"); pw.println("</channel>");
pw.println("</rss>"); pw.println("</rss>");
} }
if (this.format == 3) { if (this.format == ExportFormat.solr) {
pw.println("</result>"); pw.println("</result>");
pw.println("</response>"); pw.println("</response>");
} }

Loading…
Cancel
Save