diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 77118ec22..4c53d661c 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -46,8 +46,8 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; import de.anomic.net.URL; + import java.util.Date; -import java.util.Enumeration; import java.util.HashMap; import java.util.LinkedList; import java.util.Iterator; @@ -115,7 +115,7 @@ public class plasmaCrawlEURL extends indexURL { } public synchronized Entry getEntry(String hash) throws IOException { - return new Entry(hash); + return new Entry(hash); } public boolean exists(String urlHash) { @@ -154,21 +154,21 @@ public class plasmaCrawlEURL extends indexURL { private String failreason; // string describing reason for load fail private bitfield flags; // extra space - public Entry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags) { - // create new entry and store it into database - this.hash = urlHash(url); - this.referrer = (referrer == null) ? dummyHash : referrer; - this.initiator = initiator; - this.executor = executor; - this.url = url; - this.name = name; - this.initdate = new Date(); - this.trydate = new Date(); - this.trycount = 0; - this.failreason = failreason; - this.flags = flags; - - } + public Entry(URL url, String referrer, String initiator, + String executor, String name, String failreason, bitfield flags) { + // create new entry and store it into database + this.hash = urlHash(url); + this.referrer = (referrer == null) ? dummyHash : referrer; + this.initiator = initiator; + this.executor = executor; + this.url = url; + this.name = name; + this.initdate = new Date(); + this.trydate = new Date(); + this.trycount = 0; + this.failreason = failreason; + this.flags = flags; + } public Entry(String hash) throws IOException { // generates an plasmaEURLEntry using the url hash @@ -181,30 +181,40 @@ public class plasmaCrawlEURL extends indexURL { this.hash = hash; kelondroRow.Entry entry = urlHashCache.get(hash.getBytes()); if (entry != null) { - this.referrer = entry.getColString(1, "UTF-8"); - this.initiator = entry.getColString(2, "UTF-8"); - this.executor = entry.getColString(3, "UTF-8"); - this.url = new URL(entry.getColString(4, "UTF-8").trim()); - this.name = entry.getColString(5, "UTF-8").trim(); - this.initdate = new Date(86400000 * entry.getColLongB64E(6)); - this.trydate = new Date(86400000 * entry.getColLongB64E(7)); - this.trycount = (int) entry.getColLongB64E(8); - this.failreason = entry.getColString(9, "UTF-8"); - this.flags = new bitfield(entry.getColBytes(10)); - return; + insertEntry(entry); } } + + public Entry(kelondroRow.Entry entry) throws IOException { + insertEntry(entry); + } - private void store() { - // stores the values from the object variables into the database + private void insertEntry(kelondroRow.Entry entry) throws IOException { + assert (entry != null); + this.hash = entry.getColString(0, null); + this.referrer = entry.getColString(1, "UTF-8"); + this.initiator = entry.getColString(2, "UTF-8"); + this.executor = entry.getColString(3, "UTF-8"); + this.url = new URL(entry.getColString(4, "UTF-8").trim()); + this.name = entry.getColString(5, "UTF-8").trim(); + this.initdate = new Date(86400000 * entry.getColLongB64E(6)); + this.trydate = new Date(86400000 * entry.getColLongB64E(7)); + this.trycount = (int) entry.getColLongB64E(8); + this.failreason = entry.getColString(9, "UTF-8"); + this.flags = new bitfield(entry.getColBytes(10)); + return; + } + + private void store() { + // stores the values from the object variables into the database String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength); String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength); - // store the hash in the hash cache - try { - // even if the entry exists, we simply overwrite it - byte[][] entry = new byte[][] { - this.hash.getBytes(), + // store the hash in the hash cache + try { + // even if the entry exists, we simply overwrite it + byte[][] entry = new byte[][] { + this.hash.getBytes(), this.referrer.getBytes(), this.initiator.getBytes(), this.executor.getBytes(), @@ -215,12 +225,12 @@ public class plasmaCrawlEURL extends indexURL { kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, urlRetryLength).getBytes(), this.failreason.getBytes(), this.flags.getBytes() - }; - urlHashCache.put(urlHashCache.row().newEntry(entry)); - } catch (IOException e) { - System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); + }; + urlHashCache.put(urlHashCache.row().newEntry(entry)); + } catch (IOException e) { + System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); + } } - } public String hash() { // return a url-hash, based on the md5 algorithm @@ -267,27 +277,39 @@ public class plasmaCrawlEURL extends indexURL { } - public class kenum implements Enumeration { + public class kiter implements Iterator { // enumerates entry elements Iterator i; - public kenum(boolean up, boolean rotating) throws IOException { - i = urlHashCache.rows(up, rotating, null); + boolean error = false; + + public kiter(boolean up, boolean rotating, String firstHash) throws IOException { + i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); + error = false; } - public boolean hasMoreElements() { + + public boolean hasNext() { + if (error) return false; return i.hasNext(); } - public Object nextElement() { + + public Object next() throws RuntimeException { + kelondroRow.Entry e = (kelondroRow.Entry) i.next(); + if (e == null) return null; try { - return new Entry(new String(((byte[][]) i.next())[0])); - } catch (IOException e) { - return null; + return new Entry(e); + } catch (IOException ex) { + throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); } } + + public void remove() { + i.remove(); + } + } - - public Enumeration elements(boolean up, boolean rotating) throws IOException { - // enumerates entry elements - return new kenum(up, rotating); + + public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException { + // enumerates entry elements + return new kiter(up, rotating, firstHash); } - } diff --git a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java index 06e007a13..a3df1323d 100644 --- a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java +++ b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java @@ -188,7 +188,7 @@ public class plasmaRankingRCIEvaluation { if (!(tablePath.exists())) tablePath.mkdirs(); for (int i = 0; i < ranking.length - 1; i++) { filename = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx"; - serverFileUtils.saveSet(new File(tablePath, filename), ranking[i], ""); + serverFileUtils.saveSet(new File(tablePath, filename), "plain", ranking[i], ""); } } diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index dd6819a5f..e33561c69 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -55,6 +55,8 @@ import java.io.BufferedOutputStream; import java.io.PrintWriter; import java.util.StringTokenizer; import java.util.zip.GZIPOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; import java.util.Set; import java.util.Map; import java.util.HashSet; @@ -310,44 +312,66 @@ public final class serverFileUtils { return set; } - public static void saveSet(File file, Set set, String sep) throws IOException { - File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000)); - BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf)); + public static void saveSet(File file, String format, Set set, String sep) throws IOException { + File tf = new File(file.toString() + ".tmp" + (System.currentTimeMillis() % 1000)); + OutputStream os = null; + if ((format == null) || (format.equals("plain"))) { + os = new BufferedOutputStream(new FileOutputStream(tf)); + } else if (format.equals("gzip")) { + os = new GZIPOutputStream(new FileOutputStream(tf)); + } else if (format.equals("zip")) { + ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(file)); + String name = file.getName(); + if (name.endsWith(".zip")) name = name.substring(0, name.length() - 4); + zos.putNextEntry(new ZipEntry(name + ".txt")); + os = zos; + } Iterator i = set.iterator(); String key; if (i.hasNext()) { key = i.next().toString(); - bos.write(key.getBytes()); + os.write(key.getBytes()); } while (i.hasNext()) { key = i.next().toString(); - if (sep != null) bos.write(sep.getBytes()); - bos.write(key.getBytes()); + if (sep != null) os.write(sep.getBytes()); + os.write(key.getBytes()); } - bos.close(); + os.close(); file.delete(); tf.renameTo(file); } - public static void saveSet(File file, kelondroRowSet set, String sep) throws IOException { - File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000)); - BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf)); + public static void saveSet(File file, String format, kelondroRowSet set, String sep) throws IOException { + File tf = new File(file.toString() + ".tmp" + (System.currentTimeMillis() % 1000)); + OutputStream os = null; + if ((format == null) || (format.equals("plain"))) { + os = new BufferedOutputStream(new FileOutputStream(tf)); + } else if (format.equals("gzip")) { + os = new GZIPOutputStream(new FileOutputStream(tf)); + } else if (format.equals("zip")) { + ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(file)); + String name = file.getName(); + if (name.endsWith(".zip")) name = name.substring(0, name.length() - 4); + zos.putNextEntry(new ZipEntry(name + ".txt")); + os = zos; + } Iterator i = set.rows(); String key; if (i.hasNext()) { key = new String(((kelondroRow.Entry) i.next()).getColBytes(0)); - bos.write(key.getBytes()); + os.write(key.getBytes()); } while (i.hasNext()) { key = new String(((kelondroRow.Entry) i.next()).getColBytes(0)); - if (sep != null) bos.write(sep.getBytes()); - bos.write(key.getBytes()); + if (sep != null) os.write(sep.getBytes()); + os.write(key.getBytes()); } - bos.close(); + os.close(); file.delete(); tf.renameTo(file); } - + /** * Moves all files from a directory to another. * @param from_dir Directory which contents will be moved. diff --git a/source/yacy.java b/source/yacy.java index e42702c04..c1d0e1f01 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -65,7 +65,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import java.util.zip.GZIPOutputStream; import de.anomic.data.translator; import de.anomic.http.httpHeader; @@ -80,6 +79,7 @@ import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURLPool; import de.anomic.plasma.plasmaWordIndex; @@ -932,33 +932,50 @@ public final class yacy { * @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain" * @see urllist */ - private static void domlist(String homePath, String format, String targetName) { + private static void domlist(String homePath, String source, String format, String targetName) { File root = new File(homePath); try { plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000); - Iterator eiter = pool.loadedURL.entries(true, false, null); - HashSet doms = new HashSet(); - plasmaCrawlLURL.Entry entry; + HashMap doms = new HashMap(); System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries."); System.out.println("a dump will be written after double-check of all extracted domains."); System.out.println("This process may fail in case of too less memory. To increase memory, start with"); - System.out.println("java -Xmsm -Xmxm -classpath classes yacy -domlist [ -format { text | html } ] [ ]"); - System.out.println("i.e."); - System.out.println("java -Xms900m -Xmx900m -classpath classes yacy -domlist"); + System.out.println("java -Xmxm -classpath classes yacy -domlist [ -source { lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ ]"); int c = 0; long start = System.currentTimeMillis(); - while (eiter.hasNext()) { - try { - entry = (plasmaCrawlLURL.Entry) eiter.next(); - if ((entry != null) && (entry.url() != null)) doms.add(entry.url().getHost()); - } catch (Exception e) { - // here an MalformedURLException may occur - // just ignore + if (source.equals("lurl")) { + Iterator eiter = pool.loadedURL.entries(true, false, null); + plasmaCrawlLURL.Entry entry; + while (eiter.hasNext()) { + try { + entry = (plasmaCrawlLURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null); + } catch (Exception e) { + // here a MalformedURLException may occur + // just ignore + } + c++; + if (c % 10000 == 0) System.out.println( + c + " urls checked, " + + doms.size() + " domains collected, " + + ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + + ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining."); } - c++; - if (c % 10000 == 0) { - System.out.println( + } + if (source.equals("eurl")) { + Iterator eiter = pool.errorURL.entries(true, false, null); + plasmaCrawlEURL.Entry entry; + while (eiter.hasNext()) { + try { + entry = (plasmaCrawlEURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.failreason()); + } catch (Exception e) { + // here a MalformedURLException may occur + // just ignore + } + c++; + if (c % 10000 == 0) System.out.println( c + " urls checked, " + doms.size() + " domains collected, " + ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + @@ -966,21 +983,24 @@ public final class yacy { } } - if (format.equals("html")) { // output file in HTML format File file = new File(root, targetName + ".html"); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); System.out.println("Started domain list dump to file " + file); - Iterator i = doms.iterator(); + Iterator i = doms.entrySet().iterator(); + Map.Entry entry; String key; bos.write(("").getBytes()); bos.write(serverCore.crlf); - bos.write(("YaCy domainlist").getBytes()); + bos.write(("YaCy " + source + " domainlist").getBytes()); bos.write(serverCore.crlf); while (i.hasNext()) { - key = i.next().toString(); - bos.write(("" + key + "
").getBytes()); + entry = (Map.Entry) i.next(); + key = (String) entry.getKey(); + bos.write(("" + key + "" + + ((entry.getValue() == null) ? "" : ((String) entry.getValue())) + "
" + ).getBytes()); bos.write(serverCore.crlf); } bos.write(("").getBytes()); @@ -988,39 +1008,20 @@ public final class yacy { } else if (format.equals("zip")) { // output file in plain text but compressed with ZIP - ZipEntry zipEntry = new ZipEntry(targetName + ".txt"); File file = new File(root, targetName + ".zip"); - ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file)); System.out.println("Started domain list dump to file " + file); - bos.putNextEntry(zipEntry); - Iterator i = doms.iterator(); - String key; - while (i.hasNext()) { - key = i.next().toString(); - bos.write((key).getBytes()); - bos.write(serverCore.crlf); - } - bos.close(); + serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf)); } else if (format.equals("gzip")) { // output file in plain text but compressed with GZIP File file = new File(root, targetName + ".txt.gz"); - GZIPOutputStream bos = new GZIPOutputStream(new FileOutputStream(file)); System.out.println("Started domain list dump to file " + file); - Iterator i = doms.iterator(); - String key; - while (i.hasNext()) { - key = i.next().toString(); - bos.write((key).getBytes()); - bos.write(serverCore.crlf); - } - bos.close(); - } - else { + serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf)); + } else { // plain text list File file = new File(root, targetName + ".txt"); System.out.println("Started domain list dump to file " + file); - serverFileUtils.saveSet(file, doms, new String(serverCore.crlf)); + serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf)); } pool.close(); } catch (IOException e) { @@ -1028,23 +1029,42 @@ public final class yacy { } } - private static void urllist(String homePath, boolean html, String targetName) { + private static void urllist(String homePath, String source, boolean html, String targetName) { File root = new File(homePath); try { plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000); - Iterator eiter = pool.loadedURL.entries(true, false, null); - plasmaCrawlLURL.Entry entry; File file = new File(root, targetName); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); - while (eiter.hasNext()) { - entry = (plasmaCrawlLURL.Entry) eiter.next(); - if ((entry != null) && (entry.url() != null)) { - if (html) { - bos.write(("" + entry.descr() + "
").getBytes("UTF-8")); - bos.write(serverCore.crlf); - } else { - bos.write(entry.url().toString().getBytes()); - bos.write(serverCore.crlf); + + if (source.equals("lurl")) { + Iterator eiter = pool.loadedURL.entries(true, false, null); + plasmaCrawlLURL.Entry entry; + while (eiter.hasNext()) { + entry = (plasmaCrawlLURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) { + if (html) { + bos.write(("" + entry.descr() + "
").getBytes("UTF-8")); + bos.write(serverCore.crlf); + } else { + bos.write(entry.url().toString().getBytes()); + bos.write(serverCore.crlf); + } + } + } + } + if (source.equals("eurl")) { + Iterator eiter = pool.errorURL.entries(true, false, null); + plasmaCrawlEURL.Entry entry; + while (eiter.hasNext()) { + entry = (plasmaCrawlEURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) { + if (html) { + bos.write(("" + entry.url() + " " + entry.failreason() + "
").getBytes("UTF-8")); + bos.write(serverCore.crlf); + } else { + bos.write(entry.url().toString().getBytes()); + bos.write(serverCore.crlf); + } } } } @@ -1272,18 +1292,33 @@ public final class yacy { transferCR(targetaddress, crfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) { // generate a url list and save it in a file + String source = "lurl"; + if (args.length >= 3 && args[1].toLowerCase().equals("-source")) { + if ((args[2].equals("lurl")) || + (args[2].equals("eurl"))) + source = args[2]; + args = shift(args, 1, 2); + } String format = "txt"; if (args.length >= 3 && args[1].toLowerCase().equals("-format")) { - if (args[2].equals("html")) format = args[2]; - if (args[2].equals("zip")) format = args[2]; - if (args[2].equals("gzip")) format = args[2]; + if ((args[2].equals("html")) || + (args[2].equals("zip")) || + (args[2].equals("gzip"))) + format = args[2]; args = shift(args, 1, 2); } if (args.length == 2) applicationRoot= args[1]; String outfile = "domlist_" + System.currentTimeMillis(); - domlist(applicationRoot, format, outfile); + domlist(applicationRoot, source, format, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) { // generate a url list and save it in a file + String source = "lurl"; + if (args.length >= 3 && args[1].toLowerCase().equals("-source")) { + if ((args[2].equals("lurl")) || + (args[2].equals("eurl"))) + source = args[2]; + args = shift(args, 1, 2); + } boolean html = false; if (args.length >= 3 && args[1].toLowerCase().equals("-format")) { if (args[2].equals("html")) html = true; @@ -1291,7 +1326,7 @@ public final class yacy { } if (args.length == 2) applicationRoot= args[1]; String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); - urllist(applicationRoot, html, outfile); + urllist(applicationRoot, source, html, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { // generate a url list and save it in a file if (args.length == 2) applicationRoot= args[1];