added some more logging to domain extraction

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2316 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 79af283f6c
commit c57b78722b

@ -940,6 +940,13 @@ public final class yacy {
Iterator eiter = pool.loadedURL.entries(true, false); Iterator eiter = pool.loadedURL.entries(true, false);
HashSet doms = new HashSet(); HashSet doms = new HashSet();
plasmaCrawlLURL.Entry entry; plasmaCrawlLURL.Entry entry;
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xms<megabytes>m -Xmx<megabytes>m -classpath classes yacy -domlist [ -format { text | html } ] [ <path to DATA folder> ]");
System.out.println("i.e.");
System.out.println("java -Xms900m -Xmx900m -classpath classes yacy -domlist");
int c = 0;
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
entry = (plasmaCrawlLURL.Entry) eiter.next(); entry = (plasmaCrawlLURL.Entry) eiter.next();
@ -948,12 +955,16 @@ public final class yacy {
// a MalformedURLException may occur here // a MalformedURLException may occur here
// just ignore // just ignore
} }
c++;
if (c % 10000 == 0) System.out.println(c + " urls checked, " + doms.size() + " domains collected.");
} }
// output file in HTML format
if (format.equals("html")) { if (format.equals("html")) {
// output file in HTML format
File file = new File(root, targetName + ".html"); File file = new File(root, targetName + ".html");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.iterator(); Iterator i = doms.iterator();
String key; String key;
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes()); bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
@ -967,11 +978,13 @@ public final class yacy {
} }
bos.write(("</body></html>").getBytes()); bos.write(("</body></html>").getBytes());
bos.close(); bos.close();
//output file in plain text but compressed with ZIP
} else if (format.equals("zip")) { } else if (format.equals("zip")) {
// output file in plain text but compressed with ZIP
ZipEntry zipEntry = new ZipEntry(targetName + ".txt"); ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
File file = new File(root, targetName + ".zip"); File file = new File(root, targetName + ".zip");
ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file)); ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
bos.putNextEntry(zipEntry); bos.putNextEntry(zipEntry);
Iterator i = doms.iterator(); Iterator i = doms.iterator();
String key; String key;
@ -981,10 +994,12 @@ public final class yacy {
bos.write(serverCore.crlf); bos.write(serverCore.crlf);
} }
bos.close(); bos.close();
//output file in plain text but compressed with GZIP
} else if (format.equals("gzip")) { } else if (format.equals("gzip")) {
// output file in plain text but compressed with GZIP
File file = new File(root, targetName + ".txt.gz"); File file = new File(root, targetName + ".txt.gz");
GZIPOutputStream bos = new GZIPOutputStream(new FileOutputStream(file)); GZIPOutputStream bos = new GZIPOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.iterator(); Iterator i = doms.iterator();
String key; String key;
while (i.hasNext()) { while (i.hasNext()) {

Loading…
Cancel
Save