added domain list extraction and HTML export format

to the URL administration menu at http://localhost:8080/IndexControlURLs_p.html

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4228 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent d8d77fc4b2
commit 445c0b5333
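For orientation before the diff: a minimal, hypothetical sketch of the export-selector logic this commit introduces. The radio value from IndexControlURLs_p.html (e.g. "dom-html" or "url-rss") is split into a domain-only flag and a format code (0=text, 1=html, 2=rss/xml) that choose the default file extension and the per-entry output. Class and method names below are made up for illustration; the committed code lives in IndexControlURLs_p.java and plasmaCrawlLURL.java.

// Illustrative sketch only, not the committed code.
public class ExportFormatSketch {

    static final int FORMAT_TEXT = 0; // plain text list
    static final int FORMAT_HTML = 1; // HTML list with anchors
    static final int FORMAT_RSS  = 2; // RSS/XML feed

    // fname is the radio value: "dom-text", "dom-html", "url-text", "url-html" or "url-rss"
    static int parseFormat(String fname) {
        if (fname.endsWith("html")) return FORMAT_HTML;
        if (fname.endsWith("rss"))  return FORMAT_RSS;
        return FORMAT_TEXT;
    }

    static boolean domainOnly(String fname) {
        // "dom-*" exports one line per domain; RSS always carries full URLs, so dom is dropped there
        return fname.startsWith("dom") && parseFormat(fname) != FORMAT_RSS;
    }

    static String defaultExtension(int format) {
        if (format == FORMAT_HTML) return ".html";
        if (format == FORMAT_RSS)  return ".xml";
        return ".txt";
    }

    public static void main(String[] args) {
        String fname = "dom-html";
        System.out.println(parseFormat(fname));                    // 1
        System.out.println(domainOnly(fname));                     // true
        System.out.println(defaultExtension(parseFormat(fname)));  // .html
    }
}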

@@ -17,7 +17,7 @@
<input type="submit" name="urlstringsearch" value="Show Details for URL" />
</dd>
<dt class="TableCellDark">Rertieve by URL-Hash:</dt>
<dt class="TableCellDark">Retrieve by URL-Hash:</dt>
<dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
<input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" />
<input type="submit" name="urlhashsimilar" value="Generate List" />
@@ -51,8 +51,14 @@
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd><input type="radio" name="format" value="rss" checked />XML (RSS)&nbsp;&nbsp;
<input type="radio" name="format" value="text" />Plain Text List (URLs only)
<dd>Only Domain:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
<input type="radio" name="format" value="dom-html" />HTML (domains as URLs, no title)<br>
Full URL List:
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-rss" checked />XML (RSS)
</br>
</dd>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" />

@@ -191,15 +191,25 @@ public class IndexControlURLs_p {
}
if (post.containsKey("lurlexport")) {
boolean rss = post.get("format", "text").equals("rss");
// parse format
int format = 0;
String fname = post.get("format", "url-text");
boolean dom = fname.startsWith("dom"); // if dom == false complete urls are exported, otherwise only the domain
if (fname.endsWith("text")) format = 0;
if (fname.endsWith("html")) format = 1;
if (fname.endsWith("rss")) format = 2;
// extend export file name
String s = post.get("exportfile", "");
if (s.indexOf('.') < 0) {
if (rss) s = s + ".xml"; else s = s + ".txt";
if (format == 0) s = s + ".txt";
if (format == 1) s = s + ".html";
if (format == 2) s = s + ".xml";
}
File f = new File(s);
f.getParentFile().mkdirs();
String filter = post.get("exportfilter", ".*");
boolean running = sb.wordIndex.loadedURL.export(f, filter, rss);
boolean running = sb.wordIndex.loadedURL.export(f, filter, format, dom);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", sb.wordIndex.loadedURL.export_count());

@@ -267,7 +267,7 @@ public class yacysearch {
"",
20,
constraint,
false);
true);
serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults());
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search

@@ -66,12 +66,14 @@ import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.logging.serverLog;
@@ -534,12 +536,12 @@ public final class plasmaCrawlLURL {
private exportc exportthread = null;
public boolean export(File f, String filter, boolean rss) {
public boolean export(File f, String filter, int format, boolean dom) {
if ((exportthread != null) && (exportthread.isAlive())) {
serverLog.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
return false;
}
this.exportthread = new exportc(f, filter, rss);
this.exportthread = new exportc(f, filter, format, dom);
this.exportthread.start();
return (this.exportthread.isAlive());
}
@@ -569,21 +571,30 @@ public final class plasmaCrawlLURL {
String filter;
int count;
String failure;
boolean rss;
int format;
boolean dom;
kelondroRowSet doms;
public exportc(File f, String filter, boolean rss) {
public exportc(File f, String filter, int format, boolean dom) {
// format: 0=text, 1=html, 2=rss/xml
this.f = f;
this.filter = filter;
this.count = 0;
this.failure = null;
this.rss = rss;
this.format = format;
this.dom = dom;
if ((dom) && (format == 2)) dom = false;
this.doms = new kelondroRowSet(new kelondroRow("String hash-6", kelondroBase64Order.enhancedCoder, 0), 0);
}
public void run() {
try {
f.getParentFile().mkdirs();
PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f)));
if (rss) {
if (format == 1) {
pw.println("<html><head></head><body>");
}
if (format == 2) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\">");
@@ -597,26 +608,45 @@ public final class plasmaCrawlLURL {
indexURLEntry entry;
indexURLEntry.Components comp;
String url;
while (i.hasNext()) {
loop: while (i.hasNext()) {
entry = (indexURLEntry) i.next();
comp = entry.comp();
url = comp.url().toNormalform(true, false);
if (!url.matches(filter)) continue;
if (rss) {
pw.println("<item>");
pw.println("<title>" + yacyURL.escape(comp.title()) + "</title>");
pw.println("<link>" + url + "</link>");
if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
pw.println("</item>");
if (dom) {
if (doms.has(entry.hash().substring(6).getBytes())) continue loop;
doms.add(entry.hash().substring(6).getBytes());
url = comp.url().getHost();
if (format == 0) {
pw.println(url);
}
if (format == 1) {
pw.println("<a href=\"http://" + url + "\">" + url + "</a><br>");
}
} else {
pw.println(url);
if (format == 0) {
pw.println(url);
}
if (format == 1) {
pw.println("<a href=\"" + url + "\">" + comp.title() + "</a><br>");
}
if (format == 2) {
pw.println("<item>");
pw.println("<title>" + comp.title() + "</title>");
pw.println("<link>" + yacyURL.escape(url) + "</link>");
if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
pw.println("</item>");
}
}
count++;
count++;
}
if (rss) {
if (format == 1) {
pw.println("</body></html>");
}
if (format == 2) {
pw.println("</channel>");
pw.println("</rss>");
}
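
The domain-only modes skip hosts that were already written by remembering the domain part of each URL hash (entry.hash().substring(6), kept in a kelondroRowSet in the committed code). A simplified sketch of the same de-duplication idea with a plain java.util.HashSet and made-up 12-character hashes:

import java.util.HashSet;
import java.util.Set;

// Sketch of the domain de-duplication used by the new "Only Domain" export modes.
public class DomainDedupSketch {
    public static void main(String[] args) {
        // hypothetical 12-character URL hashes; the trailing 6 characters identify the host
        String[] urlHashes = { "aaaaaaHOSTAA", "bbbbbbHOSTAA", "ccccccHOSTBB" };
        Set<String> seenHosts = new HashSet<String>();
        for (String hash : urlHashes) {
            String hostKey = hash.substring(6);
            if (!seenHosts.add(hostKey)) continue; // host already exported, skip this URL
            System.out.println("export entry for host key " + hostKey);
        }
    }
}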

@@ -75,10 +75,7 @@ import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMapObjects;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverCore;
@@ -766,129 +763,6 @@ public final class yacy {
serverLog.logInfo("TRANSFER-CR", "could not read file " + crfile);
}
}
/**
* Generates a text file containing all domains in this peer's DB.
* This may be useful to calculate the YaCy-Blockrank.
*
* @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain"
* @see urllist
*/
private static void domlist(String homePath, String source, String format, String targetName) {
File root = new File(homePath);
try {
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf", false);
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
int c = 0;
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = sb.wordIndex.loadedURL.entries(true, null);
indexURLEntry entry;
while (eiter.hasNext()) {
try {
entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("eurl")) {
Iterator eiter = sb.crawlQueues.errorURL.entries(true, null);
plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("nurl")) {
Iterator eiter = sb.crawlQueues.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (format.equals("html")) {
// output file in HTML format
File file = new File(root, targetName + ".html");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.entrySet().iterator();
Map.Entry entry;
String key;
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
bos.write(serverCore.crlf);
bos.write(("<html><head><title>YaCy " + source + " domainlist</title></head><body>").getBytes());
bos.write(serverCore.crlf);
while (i.hasNext()) {
entry = (Map.Entry) i.next();
key = (String) entry.getKey();
bos.write(("<a href=\"http://" + key + "\">" + key + "</a>" +
((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "<br>"
).getBytes());
bos.write(serverCore.crlf);
}
bos.write(("</body></html>").getBytes());
bos.close();
} else if (format.equals("zip")) {
// output file in plain text but compressed with ZIP
File file = new File(root, targetName + ".zip");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf));
} else if (format.equals("gzip")) {
// output file in plain text but compressed with GZIP
File file = new File(root, targetName + ".txt.gz");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf));
} else {
// plain text list
File file = new File(root, targetName + ".txt");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
}
sb.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count];
@@ -1082,27 +956,6 @@ public final class yacy {
String targetaddress = args[1];
String crfile = args[2];
transferCR(targetaddress, crfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("nurl")) ||
(args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
String format = "txt";
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if ((args[2].equals("html")) ||
(args[2].equals("zip")) ||
(args[2].equals("gzip")))
format = args[2];
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
domlist(applicationRoot, source, format, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];
