added some more logging to domain extraction

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2316 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 79af283f6c
commit c57b78722b

@ -940,6 +940,13 @@ public final class yacy {
Iterator eiter = pool.loadedURL.entries(true, false);
HashSet doms = new HashSet();
plasmaCrawlLURL.Entry entry;
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xms<megabytes>m -Xmx<megabytes>m -classpath classes yacy -domlist [ -format { text | html } ] [ <path to DATA folder> ]");
System.out.println("i.e.");
System.out.println("java -Xms900m -Xmx900m -classpath classes yacy -domlist");
int c = 0;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURL.Entry) eiter.next();
@ -948,12 +955,16 @@ public final class yacy {
// a MalformedURLException may occur here;
// it is deliberately ignored and the entry is skipped
}
c++;
if (c % 10000 == 0) System.out.println(c + " urls checked, " + doms.size() + " domains collected.");
}
// output file in HTML format
if (format.equals("html")) {
// output file in HTML format
File file = new File(root, targetName + ".html");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.iterator();
String key;
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
@ -967,11 +978,13 @@ public final class yacy {
}
bos.write(("</body></html>").getBytes());
bos.close();
//output file in plain text but compressed with ZIP
} else if (format.equals("zip")) {
// output file in plain text but compressed with ZIP
ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
File file = new File(root, targetName + ".zip");
ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
bos.putNextEntry(zipEntry);
Iterator i = doms.iterator();
String key;
@ -981,10 +994,12 @@ public final class yacy {
bos.write(serverCore.crlf);
}
bos.close();
//output file in plain text but compressed with GZIP
} else if (format.equals("gzip")) {
// output file in plain text but compressed with GZIP
File file = new File(root, targetName + ".txt.gz");
GZIPOutputStream bos = new GZIPOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.iterator();
String key;
while (i.hasNext()) {

Loading…
Cancel
Save