From bfe51c722893855878101e31f8dd07001e343010 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 21 Nov 2005 01:30:30 +0000 Subject: [PATCH] added generation of domain-list git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1112 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/kelondro/kelondroRecords.java | 2 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 28 ++++++++------ source/yacy.java | 37 ++++++++++++++++--- 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index 2c87d6eb3..9ed4f2043 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -494,7 +494,7 @@ public class kelondroRecords { try { parentNode.setOHHandle(referenceInParent, null); parentNode.commit(CP_NONE); - throw new kelondroException(filename, "INTERNAL ERROR, Node/init: node handle index " + handle.index + " exceeds size. The bad node has been auto-fixed"); + logWarning("INTERNAL ERROR, Node/init in " + filename + ": node handle index " + handle.index + " exceeds size. The bad node has been auto-fixed"); } catch (IOException ee) { throw new kelondroException(filename, "INTERNAL ERROR, Node/init: node handle index " + handle.index + " exceeds size. 
It was tried to fix the bad node, but failed with an IOException: " + ee.getMessage()); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 77ea2b2c4..eb85b2fad 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -58,7 +58,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.Enumeration; +import java.util.Iterator; import java.util.LinkedList; import java.util.Locale; import java.util.Properties; @@ -716,25 +716,31 @@ public final class plasmaCrawlLURL extends plasmaURL { } } // class Entry - public class kenum implements Enumeration { + public class kiter implements Iterator { // enumerates entry elements kelondroTree.rowIterator i; - public kenum(boolean up, boolean rotating) throws IOException { + public kiter(boolean up, boolean rotating) throws IOException { i = urlHashCache.rows(up, rotating); } - public boolean hasMoreElements() { + public boolean hasNext() { return i.hasNext(); } - public Object nextElement() { - return new Entry(new String(((byte[][])i.next())[0])); + public Object next() { + byte[] e = ((byte[][])i.next())[0]; + if (e == null) return null; else return new Entry(new String(e)); } + + public void remove() { + i.remove(); + } + } - public Enumeration elements(boolean up, boolean rotating) throws IOException { + public Iterator entries(boolean up, boolean rotating) throws IOException { // enumerates entry elements - return new kenum(up, rotating); + return new kiter(up, rotating); } public static void main(String[] args) { @@ -748,9 +754,9 @@ public final class plasmaCrawlLURL extends plasmaURL { if (args[0].equals("-l")) try { // arg 1 is path to URLCache final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1); - final Enumeration enu = urls.elements(true, false); - while (enu.hasMoreElements()) { - ((Entry) 
enu.nextElement()).print(); + final Iterator enu = urls.entries(true, false); + while (enu.hasNext()) { + ((Entry) enu.next()).print(); } } catch (Exception e) { e.printStackTrace(); diff --git a/source/yacy.java b/source/yacy.java index 3e565cf0e..8ef6b98a0 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -69,6 +69,7 @@ import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; +import de.anomic.plasma.plasmaURLPool; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndexCache; import de.anomic.plasma.plasmaWordIndexClassicDB; @@ -1054,6 +1055,25 @@ public final class yacy { } } + private static void domlist(String homePath, String targetName) { + File root = new File(homePath); + try { + plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000); + Iterator eiter = pool.loadedURL.entries(true, false); + HashSet doms = new HashSet(); + plasmaCrawlLURL.Entry entry; + URL url; + while (eiter.hasNext()) { + entry = (plasmaCrawlLURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) doms.add(entry.url().getHost()); + } + serverFileUtils.saveSet(new File(root, targetName), doms, new String(serverCore.crlf)); + pool.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + /** * Main-method which is started by java. Checks for special arguments or * starts up the application. @@ -1094,15 +1114,15 @@ public final class yacy { } else if ((args.length >= 1) && (args[0].equals("-importDB"))) { // attention: this may run long and should not be interrupted! 
String importRoot = null; - if (args.length == 3) { + if (args.length == 3) { applicationRoot= args[1]; - importRoot = args[2]; + importRoot = args[2]; } else if (args.length == 2) { importRoot = args[1]; } else { System.err.println("Usage: -importDB [homeDbRoot] importDbRoot"); } - importDB(applicationRoot, importRoot); + importDB(applicationRoot, importRoot); } else if ((args.length >= 1) && (args[0].equals("-deletestopwords"))) { // delete those words in the index that are listed in the stopwords file if (args.length == 2) applicationRoot= args[1]; @@ -1121,9 +1141,14 @@ public final class yacy { cleanwordlist(args[1], minlength, maxlength); } else if ((args.length >= 1) && (args[0].equals("-transfercr"))) { // transfer a single cr file to a remote peer - String targetaddress = args[1]; - String crfile = args[2]; - transferCR(targetaddress, crfile); + String targetaddress = args[1]; + String crfile = args[2]; + transferCR(targetaddress, crfile); + } else if ((args.length >= 1) && (args[0].equals("-domlist"))) { + // generate a list of the domains (hosts) of all loaded URLs and save it in a file + if (args.length == 2) applicationRoot= args[1]; + String outfile = "domlist_" + System.currentTimeMillis() + ".txt"; + domlist(applicationRoot, outfile); } else { if (args.length == 1) applicationRoot= args[0]; startup(applicationRoot, startupMemFree, startupMemTotal);