modified the domain list export function:

- domain-only exports now use the new superfast domain list generation from the domain statistics code (sketched below, before the diff)
- better interactive behavior: the page auto-refreshes every 5 seconds while an export is running

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5118 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent 77ee0765a4
commit c97d0fcee7
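
For orientation before the raw diff, a minimal sketch of what the domain-only export now does, assuming the names used in the commit: the host set would come from the new domainNameCollector(-1) call, and exportDomains is a hypothetical wrapper; in the commit the same loop lives directly inside Export.run().

import java.io.PrintWriter;
import java.util.TreeSet;

public final class DomainExportSketch {
    // Illustrative only: in the commit this loop is part of Export.run() and the
    // host set comes from domainNameCollector(-1); exportDomains is made up here.
    static int exportDomains(final TreeSet<String> hosts, final PrintWriter pw,
                             final String filter, final int format) {
        int count = 0;
        for (final String host : hosts) {
            if (!host.matches(filter)) continue;  // same regex filter as the full-URL export
            if (format == 0) pw.println(host);    // 0 = plain text list, one domain per line
            if (format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
            count++;                              // exported-domain counter shown on the page
        }
        return count;
    }
}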

@@ -1,5 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
#(reload)#::<meta http-equiv="REFRESH" content="5; url=/IndexControlURLs_p.html">#(/reload)#
<head>
<title>YaCy '#[clientname]#': Index Control</title>
#%env/templates/metas.template%#
@@ -76,13 +77,13 @@
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd>Only Domain:
<dd>Only Domain <i>(superfast)</i>:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
<input type="radio" name="format" value="dom-html" />HTML (domains as URLs, no title)<br />
Full URL List:
<input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br />
Full URL List <i>(high IO)&nbsp;&nbsp;&nbsp;&nbsp;</i>:
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-rss" checked="checked" />XML (RSS)
<input type="radio" name="format" value="url-rss" />XML (RSS)
<br />
</dd>
<dt class="TableCellLight"></dt>
@@ -95,7 +96,7 @@
#(/lurlexport)#
#(lurlexportfinished)#::
<div class="commit">Finished export of #[urlcount]# URLs to file #[exportfile]#</div>::
<div class="commit">Finished export of #[urlcount]# URLs to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a></div>::
#(/lurlexportfinished)#
#(lurlexporterror)#::
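
The #(reload)# switch added at the top of the template is driven by the prop.put("reload", ...) calls in the servlet diff below: value 0 selects the empty first alternative, value 1 emits the meta-refresh tag so the page reloads every 5 seconds. A minimal sketch of that coupling, assuming the serverObjects property map these servlets use; fillReloadFlag is a made-up helper, the real code sets the flag inline in each branch.

import de.anomic.server.serverObjects;

public final class ReloadFlagSketch {
    // Hypothetical helper; the servlet below sets the flag inline in every branch.
    static void fillReloadFlag(final serverObjects prop, final boolean exportRunning) {
        // 1 -> #(reload)# renders <meta http-equiv="REFRESH" content="5; ...">
        // 0 -> the empty alternative, i.e. no auto-refresh
        prop.put("reload", exportRunning ? 1 : 0);
    }
}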

@@ -58,6 +58,7 @@ public class IndexControlURLs_p {
prop.put("statistics", 1);
prop.put("statistics_lines", 100);
prop.put("statisticslines", 0);
prop.put("reload", 0);
// show export messages
final indexRepositoryReference.Export export = sb.webIndex.exportURL();
@@ -68,6 +69,7 @@ public class IndexControlURLs_p {
prop.put("lurlexporterror", 0);
prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_urlcount", export.count());
prop.put("reload", 1);
} else {
prop.put("lurlexport", 1);
prop.put("lurlexport_exportfile", sb.getRootPath() + "/DATA/EXPORT/" + serverDate.formatShortSecond());
@@ -109,6 +111,7 @@ public class IndexControlURLs_p {
final int i = sb.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
if (post.containsKey("urlhashdelete")) {
@@ -122,6 +125,7 @@ public class IndexControlURLs_p {
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
if (post.containsKey("urldelete")) {
@@ -137,6 +141,7 @@ public class IndexControlURLs_p {
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
if (post.containsKey("urlstringsearch")) {
@@ -157,6 +162,7 @@ public class IndexControlURLs_p {
prop.put("urlhash", "");
}
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
if (post.containsKey("urlhashsearch")) {
@@ -169,6 +175,7 @@ public class IndexControlURLs_p {
prop.put("statistics", 0);
}
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
// generate list
@@ -199,6 +206,7 @@ public class IndexControlURLs_p {
prop.put("result", "No Entries for URL hash " + urlhash);
}
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
if (post.containsKey("lurlexport")) {
@@ -227,6 +235,7 @@ public class IndexControlURLs_p {
if ((running != null) && (running.failed() == null)) {
prop.put("lurlexport", 2);
}
prop.put("reload", 1);
}
if (post.containsKey("deletedomain")) {
@@ -239,6 +248,7 @@ public class IndexControlURLs_p {
}
// trigger the loading of the table
post.put("statistics", "");
prop.put("reload", 0);
}
if (post.containsKey("statistics")) {
@@ -266,6 +276,7 @@ public class IndexControlURLs_p {
prop.put("statisticslines_domains", cnt);
prop.put("statisticslines", 1);
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
// insert constants
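
Taken together, the hunks above make the servlet report a running export and keep the page refreshing until it finishes. A condensed sketch of that branching, not the literal servlet code: the guarding condition is not visible in the hunks, so the null/failed() test is an assumption, reportExportState is a made-up name, and the import path is inferred from the class name in the hunk headers.

import de.anomic.index.indexRepositoryReference;  // package assumed
import de.anomic.server.serverObjects;

public final class ExportStateSketch {
    static void reportExportState(final serverObjects prop,
                                  final indexRepositoryReference.Export export) {
        if (export != null && export.failed() == null) {
            prop.put("lurlexport", 2);                        // template shows "export running"
            prop.put("lurlexport_exportfile", export.file().toString());
            prop.put("lurlexport_urlcount", export.count());  // live URL counter
            prop.put("reload", 1);                            // auto-refresh until finished
        } else {
            prop.put("lurlexport", 1);                        // show the export form
            prop.put("reload", 0);
        }
    }
}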

@@ -37,18 +37,17 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.data.htmlTools;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroSplitTable;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
@@ -402,7 +401,6 @@ public final class indexRepositoryReference {
private String failure;
private final int format;
private final boolean dom;
private final kelondroRowSet doms;
public Export(final File f, final String filter, final int format, boolean dom) {
// format: 0=text, 1=html, 2=rss/xml
@@ -413,7 +411,6 @@ public final class indexRepositoryReference {
this.format = format;
this.dom = dom;
if ((dom) && (format == 2)) dom = false;
this.doms = new kelondroRowSet(new kelondroRow("String hash-6", kelondroBase64Order.enhancedCoder, 0), 0);
}
public void run() {
@@ -433,26 +430,24 @@ public final class indexRepositoryReference {
pw.println("<link>http://yacy.net</link>");
}
final Iterator<indexURLReference> i = entries(true, null); // iterates indexURLEntry objects
indexURLReference entry;
indexURLReference.Components comp;
String url;
loop: while (i.hasNext()) {
entry = i.next();
comp = entry.comp();
url = comp.url().toNormalform(true, false);
if (!url.matches(filter)) continue;
if (dom) {
if (doms.has(entry.hash().substring(6).getBytes())) continue loop;
doms.add(entry.hash().substring(6).getBytes());
url = comp.url().getHost();
if (format == 0) {
pw.println(url);
}
if (format == 1) {
pw.println("<a href=\"http://" + url + "\">" + url + "</a><br>");
}
} else {
if (dom) {
TreeSet<String> set = domainNameCollector(-1);
for (String host: set) {
if (!host.matches(filter)) continue;
if (format == 0) pw.println(host);
if (format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
count++;
}
} else {
final Iterator<indexURLReference> i = entries(true, null); // iterates indexURLEntry objects
indexURLReference entry;
indexURLReference.Components comp;
String url;
while (i.hasNext()) {
entry = i.next();
comp = entry.comp();
url = comp.url().toNormalform(true, false);
if (!url.matches(filter)) continue;
if (format == 0) {
pw.println(url);
}
@@ -469,8 +464,8 @@ public final class indexRepositoryReference {
pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
pw.println("</item>");
}
count++;
}
count++;
}
if (format == 1) {
pw.println("</body></html>");
@@ -501,10 +496,7 @@ public final class indexRepositoryReference {
}
public Iterator<hostStat> statistics(int count) throws IOException {
// prevent too heavy IO.
if (statsDump != null && count <= statsDump.size()) return statsDump.iterator();
private HashMap<String, hashStat> domainSampleCollector() throws IOException {
HashMap<String, hashStat> map = new HashMap<String, hashStat>();
// first collect all domains and calculate statistics about it
kelondroCloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
@@ -512,7 +504,7 @@ public final class indexRepositoryReference {
hashStat ds;
if (i != null) while (i.hasNext()) {
urlhash = new String(i.next());
hosthash = urlhash.substring(6,11);
hosthash = urlhash.substring(6, 11);
ds = map.get(hosthash);
if (ds == null) {
ds = new hashStat(urlhash);
@@ -521,6 +513,35 @@ public final class indexRepositoryReference {
ds.count++;
}
}
return map;
}
public TreeSet<String> domainNameCollector(int count) throws IOException {
// collect hashes from all domains
HashMap<String, hashStat> map = domainSampleCollector();
// fetch urls from the database to determine the host in clear text
indexURLReference urlref;
if (count < 0 || count > map.size()) count = map.size();
statsDump = new ArrayList<hostStat>();
TreeSet<String> set = new TreeSet<String>();
for (hashStat hs: map.values()) {
if (hs == null) continue;
urlref = this.load(hs.urlhash, null, 0);
if (urlref == null || urlref.comp() == null || urlref.comp().url() == null || urlref.comp().url().getHost() == null) continue;
set.add(urlref.comp().url().getHost());
count--;
if (count == 0) break;
}
return set;
}
public Iterator<hostStat> statistics(int count) throws IOException {
// prevent too heavy IO.
if (statsDump != null && count <= statsDump.size()) return statsDump.iterator();
// collect hashes from all domains
HashMap<String, hashStat> map = domainSampleCollector();
// order elements by size
kelondroMScoreCluster<String> s = new kelondroMScoreCluster<String>();
@@ -531,10 +552,11 @@ public final class indexRepositoryReference {
// fetch urls from the database to determine the host in clear text
Iterator<String> j = s.scores(false); // iterate urlhash-examples in reverse order (biggest first)
indexURLReference urlref;
String urlhash;
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
if (count < 0 || count > s.size()) count = s.size();
statsDump = new ArrayList<hostStat>();
while (j.hasNext() && count > 0) {
while (j.hasNext()) {
urlhash = j.next();
if (urlhash == null) continue;
urlref = this.load(urlhash, null, 0);
@@ -542,6 +564,7 @@ public final class indexRepositoryReference {
if (statsDump == null) return new ArrayList<hostStat>().iterator(); // some other operation has destroyed the object
statsDump.add(new hostStat(urlref.comp().url().getHost(), urlhash.substring(6, 11), s.getScore(urlhash)));
count--;
if (count == 0) break;
}
// finally return an iterator for the result array
return (statsDump == null) ? new ArrayList<hostStat>().iterator() : statsDump.iterator();
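
The core of the change is in this file: the old Export.run() deduplicated domains on the fly with a kelondroRowSet keyed on entry.hash().substring(6), loading and normalizing every URL entry. The new code reuses the statistics machinery instead: domainSampleCollector() makes a single pass over the URL-hash index and buckets hashes by urlhash.substring(6, 11), part of the host-specific tail of YaCy's URL hash, and domainNameCollector() then loads only one sample URL per bucket to resolve the host name. A self-contained illustration of that grouping step, with HostSample standing in for the diff's hashStat class:

import java.util.HashMap;
import java.util.Map;

public final class HostHashGrouping {

    static final class HostSample {
        final String sampleUrlHash; // one representative URL hash, later resolved to a host name
        int count = 1;              // number of URLs sharing this host hash
        HostSample(final String urlhash) { this.sampleUrlHash = urlhash; }
    }

    // Groups URL hashes by their host-hash part, as domainSampleCollector() does
    // over the keys of urlIndexFile.
    static Map<String, HostSample> groupByHostHash(final Iterable<String> urlhashes) {
        final Map<String, HostSample> map = new HashMap<String, HostSample>();
        for (final String urlhash : urlhashes) {
            final String hosthash = urlhash.substring(6, 11); // same key as in the diff
            final HostSample ds = map.get(hosthash);
            if (ds == null) map.put(hosthash, new HostSample(urlhash));
            else ds.count++;
        }
        return map;
    }
}

Because domainNameCollector() calls load() only once per bucket instead of once per stored URL, the domain-only export earns its "superfast" label in the template, while the full URL list keeps its "high IO" warning.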
