- removed command-line option to export URLs

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4226 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent af10f729df
commit bf6952abe7

@ -32,6 +32,37 @@
#{/rows}#
#(/urlhashsimilar)#
#(lurlexportfinished)#::
<div class="commit">Finished export of #[urlcount]# URLs to file #[exportfile]#</div>::
#(/lurlexportfinished)#
#(lurlexporterror)#::
<div class="error">Export to file #[exportfile]# failed: #[exportfailmsg]#</div>::
#(/lurlexporterror)#
#(lurlexport)#::
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Loaded URL Export</legend>
<dl>
<dt class="TableCellDark">Export File</dt>
<dd><input type="text" name="exportfile" value="#[exportfile]#" size="80" maxlength="250" />
</dd>
<dt class="TableCellDark">URL Filter</dt>
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd><input type="radio" name="format" value="rss" checked />XML (RSS)&nbsp;&nbsp;
<input type="radio" name="format" value="text" />Plain Text List (URLs only)
</dd>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" />
</dd>
</dl>
</fieldset>
</form>::
<div class="commit" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# URLs so far</div>::
#(/lurlexport)#
#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::<table>

@ -25,6 +25,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
@ -34,6 +35,7 @@ import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRotateIterator;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacySeedDB;
@ -46,14 +48,42 @@ public class IndexControlURLs_p {
plasmaSwitchboard sb = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
prop.put("urlstring", "");
prop.put("urlhash", "");
prop.put("result", "");
prop.put("ucount", Integer.toString(sb.wordIndex.loadedURL.size()));
prop.put("otherHosts", "");
if (sb.wordIndex.loadedURL.export_running()) {
// there is currently a running export
prop.put("lurlexportfinished", 0);
prop.put("lurlexporterror", 0);
prop.put("lurlexport", 2);
prop.put("lurlexport_exportfile", sb.wordIndex.loadedURL.export_file().toString());
prop.put("lurlexport_urlcount", sb.wordIndex.loadedURL.export_count());
} else {
prop.put("lurlexport", 1);
prop.put("lurlexport_exportfile", sb.getRootPath() + "/DATA/EXPORT/" + serverDate.shortSecondTime());
prop.put("lurlexportfinished", 0);
prop.put("lurlexporterror", 0);
if (sb.wordIndex.loadedURL.export_failed() == null) {
// the export is finished, or there has not been a export
if (sb.wordIndex.loadedURL.export_count() > 0) {
// an export is finished
prop.put("lurlexportfinished", 1);
prop.put("lurlexportfinished_exportfile", sb.wordIndex.loadedURL.export_file().toString());
prop.put("lurlexportfinished_urlcount", sb.wordIndex.loadedURL.export_count());
}
} else {
// the export had errors
prop.put("lurlexporterror", 1);
prop.put("lurlexporterror_exportfile", sb.wordIndex.loadedURL.export_file().toString());
prop.put("lurlexporterror_exportfailmsg", sb.wordIndex.loadedURL.export_failed());
}
}
if (post == null || env == null) {
prop.put("urlstring", "");
prop.put("urlhash", "");
prop.put("result", "");
prop.put("ucount", Integer.toString(sb.wordIndex.loadedURL.size()));
prop.put("otherHosts", "");
return prop; // be save
return prop; // nothing to do
}
// default values
@ -68,12 +98,9 @@ public class IndexControlURLs_p {
prop.put("result", " ");
if (post.containsKey("urlhashdeleteall")) {
//try {
int i = sb.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
//} catch (IOException e) {
// prop.put("result", "Deleted nothing because the url-hash could not be resolved");
//}
int i = sb.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
prop.put("lurlexport", 0);
}
if (post.containsKey("urlhashdelete")) {
@ -86,6 +113,7 @@ public class IndexControlURLs_p {
sb.urlRemove(urlhash);
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
}
if (post.containsKey("urldelete")) {
@ -100,6 +128,7 @@ public class IndexControlURLs_p {
sb.urlRemove(urlhash);
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
}
if (post.containsKey("urlstringsearch")) {
@ -118,6 +147,7 @@ public class IndexControlURLs_p {
prop.putHTML("urlstring", "bad url: " + urlstring);
prop.put("urlhash", "");
}
prop.put("lurlexport", 0);
}
if (post.containsKey("urlhashsearch")) {
@ -128,6 +158,7 @@ public class IndexControlURLs_p {
prop.putHTML("urlstring", entry.comp().url().toNormalform(false, true));
prop.putAll(genUrlProfile(sb, entry, urlhash));
}
prop.put("lurlexport", 0);
}
// generate list
@ -156,6 +187,25 @@ public class IndexControlURLs_p {
} catch (IOException e) {
prop.put("result", "No Entries for URL hash " + urlhash);
}
prop.put("lurlexport", 0);
}
if (post.containsKey("lurlexport")) {
boolean rss = post.get("format", "text").equals("rss");
String s = post.get("exportfile", "");
if (s.indexOf('.') < 0) {
if (rss) s = s + ".xml"; else s = s + ".txt";
}
File f = new File(s);
f.getParentFile().mkdirs();
String filter = post.get("exportfilter", ".*");
boolean running = sb.wordIndex.loadedURL.export(f, filter, rss);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", sb.wordIndex.loadedURL.export_count());
if ((running) && (sb.wordIndex.loadedURL.export_failed() == null)) {
prop.put("lurlexport", 2);
}
}
// insert constants

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>
<rss version="2.0"
<rss version="2.0">
xmlns:yacyTopwords="http://www.yacy.net/yacy/topwords"
xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom">

@ -52,8 +52,11 @@
package de.anomic.plasma;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Pattern;
@ -529,6 +532,115 @@ public final class plasmaCrawlLURL {
}
}
private exportc exportthread = null;
/**
 * Starts a background export of all loaded URLs to the given file.
 * Only one export may run at a time; a second request while an export
 * thread is alive is refused.
 *
 * @param f      target file for the export
 * @param filter regular expression; only URLs matching it are exported
 * @param rss    true to write an RSS/XML document, false for a plain text list
 * @return true if an export thread was started, false if one is already running
 */
public boolean export(File f, String filter, boolean rss) {
    if ((exportthread != null) && (exportthread.isAlive())) {
        serverLog.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
        return false;
    }
    this.exportthread = new exportc(f, filter, rss);
    this.exportthread.start();
    // Report that the export was *started*. Checking isAlive() here (as the
    // previous version did) is racy: a small export may already have finished
    // by the time we check, which would wrongly look like a failure to start.
    return true;
}
/**
 * Error message of the last export run.
 *
 * @return the failure message, or null if no export was started yet
 *         or the last export did not record a failure
 */
public String export_failed() {
    return (exportthread == null) ? null : exportthread.failure;
}
/**
 * Number of URLs written so far by the current or most recent export.
 *
 * @return the export counter, or 0 if no export was started yet
 */
public int export_count() {
    return (exportthread == null) ? 0 : exportthread.count();
}
/**
 * Tells whether an export thread is currently active.
 *
 * @return true while an export thread is alive, false otherwise
 */
public boolean export_running() {
    return (exportthread != null) && exportthread.isAlive();
}
/**
 * Target file of the current or most recent export.
 *
 * @return the export file, or null if no export was started yet
 */
public File export_file() {
    return (exportthread == null) ? null : exportthread.file();
}
/**
 * Background thread that writes all loaded URLs matching a filter
 * to a file, either as an RSS/XML document or as a plain text list.
 * Progress and failure state are polled from the servlet thread via
 * export_count() / export_failed().
 */
public class exportc extends Thread {
    File f;
    String filter;
    // count and failure are written here and read concurrently by the
    // servlet thread (export_count()/export_failed()); volatile makes
    // the updates visible to those readers.
    volatile int count;
    volatile String failure;
    boolean rss;

    /**
     * @param f      target file
     * @param filter regular expression; only matching URLs are exported
     * @param rss    true for RSS/XML output, false for plain text
     */
    public exportc(File f, String filter, boolean rss) {
        this.f = f;
        this.filter = filter;
        this.count = 0;
        this.failure = null;
        this.rss = rss;
    }

    public void run() {
        PrintWriter pw = null;
        try {
            f.getParentFile().mkdirs();
            pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f)));
            if (rss) {
                pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
                pw.println("<rss version=\"2.0\">");
                pw.println("<channel>");
                pw.println("<title>YaCy Peer-to-Peer - Web-Search LURL Export</title>");
                pw.println("<description></description>");
                pw.println("<link>http://yacy.net</link>");
            }
            // compile the filter once; String.matches() would recompile
            // the regular expression for every single URL
            Pattern filterPattern = Pattern.compile(filter);
            Iterator i = entries(true, null); // iterates indexURLEntry objects
            indexURLEntry entry;
            indexURLEntry.Components comp;
            String url;
            while (i.hasNext()) {
                entry = (indexURLEntry) i.next();
                comp = entry.comp();
                url = comp.url().toNormalform(true, false);
                if (!filterPattern.matcher(url).matches()) continue;
                if (rss) {
                    pw.println("<item>");
                    pw.println("<title>" + yacyURL.escape(comp.title()) + "</title>");
                    pw.println("<link>" + url + "</link>");
                    // NOTE(review): author and tags are written without XML
                    // escaping (only title is escaped) - characters like '&'
                    // or '<' in these fields will produce invalid XML
                    if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
                    if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
                    pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
                    pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
                    pw.println("</item>");
                } else {
                    pw.println(url);
                }
                count++;
            }
            if (rss) {
                pw.println("</channel>");
                pw.println("</rss>");
            }
        } catch (IOException e) {
            e.printStackTrace();
            this.failure = e.getMessage();
        } finally {
            // always release the file handle, also when the export failed;
            // the previous version leaked the writer on exceptions
            if (pw != null) pw.close();
        }
        // terminate process
    }

    /** @return the target file of this export */
    public File file() {
        return this.f;
    }

    /** @return the failure message, or null if no error occurred (so far) */
    public String failed() {
        return this.failure;
    }

    /** @return number of URLs written so far */
    public int count() {
        return this.count;
    }
}
public static void main(String[] args) {
// test-generation of url hashes for debugging
// one argument requires, will be treated as url

@ -890,69 +890,6 @@ public final class yacy {
}
}
/**
 * Writes a list of URLs from one of the peer's URL databases to a file.
 *
 * @param homePath   application root directory
 * @param source     which database to dump: "lurl" (loaded), "eurl" (error)
 *                   or "nurl" (noticed/crawl-stack) URLs
 * @param html       true to write an HTML page of links, false for plain text
 * @param targetName name of the output file, created under homePath
 */
private static void urllist(String homePath, String source, boolean html, String targetName) {
    File root = new File(homePath);
    try {
        final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf", false);
        File file = new File(root, targetName);
        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
        try {
            if (source.equals("lurl")) {
                Iterator eiter = sb.wordIndex.loadedURL.entries(true, null);
                indexURLEntry entry;
                while (eiter.hasNext()) {
                    entry = (indexURLEntry) eiter.next();
                    // check entry for null BEFORE dereferencing it; the
                    // previous version called entry.comp() first, which
                    // could throw a NullPointerException
                    if (entry == null) continue;
                    indexURLEntry.Components comp = entry.comp();
                    if (comp.url() != null) {
                        if (html) {
                            bos.write(("<a href=\"" + comp.url().toNormalform(false, true) + "\">" + comp.title() + "</a><br>").getBytes("UTF-8"));
                        } else {
                            // use an explicit charset; the bare getBytes()
                            // used before depends on the platform default
                            bos.write(comp.url().toNormalform(false, true).getBytes("UTF-8"));
                        }
                        bos.write(serverCore.crlf);
                    }
                }
            }
            if (source.equals("eurl")) {
                Iterator eiter = sb.crawlQueues.errorURL.entries(true, null);
                plasmaCrawlZURL.Entry entry;
                while (eiter.hasNext()) {
                    entry = (plasmaCrawlZURL.Entry) eiter.next();
                    if ((entry != null) && (entry.url() != null)) {
                        if (html) {
                            bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.anycause() + "<br>").getBytes("UTF-8"));
                        } else {
                            bos.write(entry.url().toString().getBytes("UTF-8"));
                        }
                        bos.write(serverCore.crlf);
                    }
                }
            }
            if (source.equals("nurl")) {
                Iterator eiter = sb.crawlQueues.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
                plasmaCrawlEntry entry;
                while (eiter.hasNext()) {
                    entry = (plasmaCrawlEntry) eiter.next();
                    if ((entry != null) && (entry.url() != null)) {
                        if (html) {
                            bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8"));
                        } else {
                            bos.write(entry.url().toString().getBytes("UTF-8"));
                        }
                        bos.write(serverCore.crlf);
                    }
                }
            }
        } finally {
            // close stream and switchboard even if an iteration failed;
            // the previous version leaked both on exceptions
            bos.close();
            sb.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos);
@ -1166,24 +1103,6 @@ public final class yacy {
if (args.length == 2) applicationRoot= args[1];
String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
domlist(applicationRoot, source, format, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("nurl")) ||
(args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
boolean html = false;
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if (args[2].equals("html")) html = true;
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];

Loading…
Cancel
Save