added option to extract domains and/or urls from the eurl database

When extracting from eurl, the html output format is recommended, since
this format also adds the fail reason to the domain/url.
The complete syntax for domain extraction is now:
java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { lurl | eurl } ] [ -format { text  | zip | gzip | html } ] [ <path to DATA folder> ]


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2322 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 7e0a130fb5
commit 1ed3e2daef

@ -46,8 +46,8 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import de.anomic.net.URL;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Iterator;
@ -115,7 +115,7 @@ public class plasmaCrawlEURL extends indexURL {
}
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
return new Entry(hash);
}
public boolean exists(String urlHash) {
@ -154,21 +154,21 @@ public class plasmaCrawlEURL extends indexURL {
private String failreason; // string describing reason for load fail
private bitfield flags; // extra space
public Entry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags) {
// create new entry and store it into database
this.hash = urlHash(url);
this.referrer = (referrer == null) ? dummyHash : referrer;
this.initiator = initiator;
this.executor = executor;
this.url = url;
this.name = name;
this.initdate = new Date();
this.trydate = new Date();
this.trycount = 0;
this.failreason = failreason;
this.flags = flags;
}
/**
 * Builds a fresh entry from its individual parts.
 * The hash is computed from the url, both dates are set to the moment of
 * construction and the try counter starts at zero.
 *
 * @param url        the url this entry describes; its hash becomes the entry hash
 * @param referrer   hash of the referrer, or null to use the dummy hash
 * @param initiator  initiator value stored as given
 * @param executor   executor value stored as given
 * @param name       name stored as given
 * @param failreason string describing the reason for the load failure
 * @param flags      extra flag space
 */
public Entry(URL url, String referrer, String initiator,
             String executor, String name, String failreason, bitfield flags) {
    this.hash = urlHash(url);
    // a missing referrer is represented by the dummy hash
    if (referrer == null) {
        this.referrer = dummyHash;
    } else {
        this.referrer = referrer;
    }
    this.initiator = initiator;
    this.executor = executor;
    this.url = url;
    this.name = name;
    // both timestamps start out as "now"; nothing has been retried yet
    this.initdate = new Date();
    this.trydate = new Date();
    this.trycount = 0;
    this.failreason = failreason;
    this.flags = flags;
}
public Entry(String hash) throws IOException {
// generates an plasmaEURLEntry using the url hash
@ -181,30 +181,40 @@ public class plasmaCrawlEURL extends indexURL {
this.hash = hash;
kelondroRow.Entry entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
this.referrer = entry.getColString(1, "UTF-8");
this.initiator = entry.getColString(2, "UTF-8");
this.executor = entry.getColString(3, "UTF-8");
this.url = new URL(entry.getColString(4, "UTF-8").trim());
this.name = entry.getColString(5, "UTF-8").trim();
this.initdate = new Date(86400000 * entry.getColLongB64E(6));
this.trydate = new Date(86400000 * entry.getColLongB64E(7));
this.trycount = (int) entry.getColLongB64E(8);
this.failreason = entry.getColString(9, "UTF-8");
this.flags = new bitfield(entry.getColBytes(10));
return;
insertEntry(entry);
}
}
/**
 * Builds an entry directly from a database row by handing the row to
 * insertEntry.
 *
 * @param entry the row to read the field values from
 * @throws IOException passed on from insertEntry
 */
public Entry(kelondroRow.Entry entry) throws IOException {
    this.insertEntry(entry);
}
private void store() {
// stores the values from the object variables into the database
/**
 * Copies the columns of a database row into the fields of this entry.
 * Columns are read in order 0..10: hash, referrer, initiator, executor,
 * url, name, init date, try date, try count, fail reason, flags.
 *
 * @param entry the row to read; asserted to be non-null
 * @throws IOException NOTE(review): presumably raised when the stored url
 *         text cannot be turned into a URL - confirm against de.anomic.net.URL
 */
private void insertEntry(kelondroRow.Entry entry) throws IOException {
    assert (entry != null);

    // identity and provenance columns
    this.hash = entry.getColString(0, null);
    this.referrer = entry.getColString(1, "UTF-8");
    this.initiator = entry.getColString(2, "UTF-8");
    this.executor = entry.getColString(3, "UTF-8");

    // url and name are trimmed after reading
    final String urlText = entry.getColString(4, "UTF-8").trim();
    this.url = new URL(urlText);
    this.name = entry.getColString(5, "UTF-8").trim();

    // the stored date values are multiplied by 86400000 to obtain the Date argument
    this.initdate = new Date(86400000 * entry.getColLongB64E(6));
    this.trydate = new Date(86400000 * entry.getColLongB64E(7));

    this.trycount = (int) entry.getColLongB64E(8);
    this.failreason = entry.getColString(9, "UTF-8");
    this.flags = new bitfield(entry.getColBytes(10));
}
private void store() {
// stores the values from the object variables into the database
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
byte[][] entry = new byte[][] {
this.hash.getBytes(),
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
byte[][] entry = new byte[][] {
this.hash.getBytes(),
this.referrer.getBytes(),
this.initiator.getBytes(),
this.executor.getBytes(),
@ -215,12 +225,12 @@ public class plasmaCrawlEURL extends indexURL {
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, urlRetryLength).getBytes(),
this.failreason.getBytes(),
this.flags.getBytes()
};
urlHashCache.put(urlHashCache.row().newEntry(entry));
} catch (IOException e) {
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
};
urlHashCache.put(urlHashCache.row().newEntry(entry));
} catch (IOException e) {
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
}
}
}
public String hash() {
// return a url-hash, based on the md5 algorithm
@ -267,27 +277,39 @@ public class plasmaCrawlEURL extends indexURL {
}
public class kenum implements Enumeration {
public class kiter implements Iterator {
// enumerates entry elements
Iterator i;
public kenum(boolean up, boolean rotating) throws IOException {
i = urlHashCache.rows(up, rotating, null);
boolean error = false;
public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
error = false;
}
public boolean hasMoreElements() {
public boolean hasNext() {
if (error) return false;
return i.hasNext();
}
public Object nextElement() {
public Object next() throws RuntimeException {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
try {
return new Entry(new String(((byte[][]) i.next())[0]));
} catch (IOException e) {
return null;
return new Entry(e);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
}
// forwards the remove request to the wrapped row iterator i
public void remove() {
i.remove();
}
}
public Enumeration elements(boolean up, boolean rotating) throws IOException {
// enumerates entry elements
return new kenum(up, rotating);
public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, rotating, firstHash);
}
}

@ -188,7 +188,7 @@ public class plasmaRankingRCIEvaluation {
if (!(tablePath.exists())) tablePath.mkdirs();
for (int i = 0; i < ranking.length - 1; i++) {
filename = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx";
serverFileUtils.saveSet(new File(tablePath, filename), ranking[i], "");
serverFileUtils.saveSet(new File(tablePath, filename), "plain", ranking[i], "");
}
}

@ -55,6 +55,8 @@ import java.io.BufferedOutputStream;
import java.io.PrintWriter;
import java.util.StringTokenizer;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import java.util.Set;
import java.util.Map;
import java.util.HashSet;
@ -310,44 +312,66 @@ public final class serverFileUtils {
return set;
}
public static void saveSet(File file, Set set, String sep) throws IOException {
File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf));
public static void saveSet(File file, String format, Set set, String sep) throws IOException {
File tf = new File(file.toString() + ".tmp" + (System.currentTimeMillis() % 1000));
OutputStream os = null;
if ((format == null) || (format.equals("plain"))) {
os = new BufferedOutputStream(new FileOutputStream(tf));
} else if (format.equals("gzip")) {
os = new GZIPOutputStream(new FileOutputStream(tf));
} else if (format.equals("zip")) {
ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(file));
String name = file.getName();
if (name.endsWith(".zip")) name = name.substring(0, name.length() - 4);
zos.putNextEntry(new ZipEntry(name + ".txt"));
os = zos;
}
Iterator i = set.iterator();
String key;
if (i.hasNext()) {
key = i.next().toString();
bos.write(key.getBytes());
os.write(key.getBytes());
}
while (i.hasNext()) {
key = i.next().toString();
if (sep != null) bos.write(sep.getBytes());
bos.write(key.getBytes());
if (sep != null) os.write(sep.getBytes());
os.write(key.getBytes());
}
bos.close();
os.close();
file.delete();
tf.renameTo(file);
}
public static void saveSet(File file, kelondroRowSet set, String sep) throws IOException {
File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf));
public static void saveSet(File file, String format, kelondroRowSet set, String sep) throws IOException {
File tf = new File(file.toString() + ".tmp" + (System.currentTimeMillis() % 1000));
OutputStream os = null;
if ((format == null) || (format.equals("plain"))) {
os = new BufferedOutputStream(new FileOutputStream(tf));
} else if (format.equals("gzip")) {
os = new GZIPOutputStream(new FileOutputStream(tf));
} else if (format.equals("zip")) {
ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(file));
String name = file.getName();
if (name.endsWith(".zip")) name = name.substring(0, name.length() - 4);
zos.putNextEntry(new ZipEntry(name + ".txt"));
os = zos;
}
Iterator i = set.rows();
String key;
if (i.hasNext()) {
key = new String(((kelondroRow.Entry) i.next()).getColBytes(0));
bos.write(key.getBytes());
os.write(key.getBytes());
}
while (i.hasNext()) {
key = new String(((kelondroRow.Entry) i.next()).getColBytes(0));
if (sep != null) bos.write(sep.getBytes());
bos.write(key.getBytes());
if (sep != null) os.write(sep.getBytes());
os.write(key.getBytes());
}
bos.close();
os.close();
file.delete();
tf.renameTo(file);
}
/**
* Moves all files from a directory to another.
* @param from_dir Directory which contents will be moved.

@ -65,7 +65,6 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import java.util.zip.GZIPOutputStream;
import de.anomic.data.translator;
import de.anomic.http.httpHeader;
@ -80,6 +79,7 @@ import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool;
import de.anomic.plasma.plasmaWordIndex;
@ -932,33 +932,50 @@ public final class yacy {
* @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain"
* @see urllist
*/
private static void domlist(String homePath, String format, String targetName) {
private static void domlist(String homePath, String source, String format, String targetName) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000);
Iterator eiter = pool.loadedURL.entries(true, false, null);
HashSet doms = new HashSet();
plasmaCrawlLURL.Entry entry;
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xms<megabytes>m -Xmx<megabytes>m -classpath classes yacy -domlist [ -format { text | html } ] [ <path to DATA folder> ]");
System.out.println("i.e.");
System.out.println("java -Xms900m -Xmx900m -classpath classes yacy -domlist");
System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
int c = 0;
long start = System.currentTimeMillis();
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.add(entry.url().getHost());
} catch (Exception e) {
// here an MalformedURLException may occur
// just ignore
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
c++;
if (c % 10000 == 0) {
System.out.println(
}
if (source.equals("eurl")) {
Iterator eiter = pool.errorURL.entries(true, false, null);
plasmaCrawlEURL.Entry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlEURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.failreason());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
@ -966,21 +983,24 @@ public final class yacy {
}
}
if (format.equals("html")) {
// output file in HTML format
File file = new File(root, targetName + ".html");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.iterator();
Iterator i = doms.entrySet().iterator();
Map.Entry entry;
String key;
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
bos.write(serverCore.crlf);
bos.write(("<html><head><title>YaCy domainlist</title></head><body>").getBytes());
bos.write(("<html><head><title>YaCy " + source + " domainlist</title></head><body>").getBytes());
bos.write(serverCore.crlf);
while (i.hasNext()) {
key = i.next().toString();
bos.write(("<a href=\"http://" + key + "\">" + key + "</a><br>").getBytes());
entry = (Map.Entry) i.next();
key = (String) entry.getKey();
bos.write(("<a href=\"http://" + key + "\">" + key + "</a>" +
((entry.getValue() == null) ? "" : ((String) entry.getValue())) + "<br>"
).getBytes());
bos.write(serverCore.crlf);
}
bos.write(("</body></html>").getBytes());
@ -988,39 +1008,20 @@ public final class yacy {
} else if (format.equals("zip")) {
// output file in plain text but compressed with ZIP
ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
File file = new File(root, targetName + ".zip");
ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
bos.putNextEntry(zipEntry);
Iterator i = doms.iterator();
String key;
while (i.hasNext()) {
key = i.next().toString();
bos.write((key).getBytes());
bos.write(serverCore.crlf);
}
bos.close();
serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf));
} else if (format.equals("gzip")) {
// output file in plain text but compressed with GZIP
File file = new File(root, targetName + ".txt.gz");
GZIPOutputStream bos = new GZIPOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.iterator();
String key;
while (i.hasNext()) {
key = i.next().toString();
bos.write((key).getBytes());
bos.write(serverCore.crlf);
}
bos.close();
}
else {
serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf));
} else {
// plain text list
File file = new File(root, targetName + ".txt");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, doms, new String(serverCore.crlf));
serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
}
pool.close();
} catch (IOException e) {
@ -1028,23 +1029,42 @@ public final class yacy {
}
}
private static void urllist(String homePath, boolean html, String targetName) {
private static void urllist(String homePath, String source, boolean html, String targetName) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000);
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry;
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
while (eiter.hasNext()) {
entry = (plasmaCrawlLURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
bos.write(serverCore.crlf);
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
bos.write(serverCore.crlf);
}
}
}
}
if (source.equals("eurl")) {
Iterator eiter = pool.errorURL.entries(true, false, null);
plasmaCrawlEURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlEURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.failreason() + "<br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
bos.write(serverCore.crlf);
}
}
}
}
@ -1272,18 +1292,33 @@ public final class yacy {
transferCR(targetaddress, crfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
String format = "txt";
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if (args[2].equals("html")) format = args[2];
if (args[2].equals("zip")) format = args[2];
if (args[2].equals("gzip")) format = args[2];
if ((args[2].equals("html")) ||
(args[2].equals("zip")) ||
(args[2].equals("gzip")))
format = args[2];
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "domlist_" + System.currentTimeMillis();
domlist(applicationRoot, format, outfile);
domlist(applicationRoot, source, format, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
boolean html = false;
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if (args[2].equals("html")) html = true;
@ -1291,7 +1326,7 @@ public final class yacy {
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, html, outfile);
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];

Loading…
Cancel
Save