From 445c0b53337abf269309ec94d4f989b523925ec6 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 22 Nov 2007 20:47:06 +0000 Subject: [PATCH] added domain list extraction and html export format to URL administration menu http://localhost:8080/IndexControlURLs_p.html git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4228 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlURLs_p.html | 12 +- htroot/IndexControlURLs_p.java | 16 +- htroot/yacysearch.java | 2 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 68 ++++++--- source/yacy.java | 147 ------------------- 5 files changed, 72 insertions(+), 173 deletions(-) diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index 9c69bca98..ca2b73eef 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -17,7 +17,7 @@ -
Rertieve by URL-Hash:
+
Retrieve by URL-Hash:
@@ -51,8 +51,14 @@
Export Format
-
XML (RSS)   - Plain Text List (URLs only) +
Only Domain: + Plain Text List (domains only)   + HTML (domains as URLs, no title)
+ Full URL List: + Plain Text List (URLs only)        + HTML (URLs with title)       + XML (RSS) +
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index f4c4f2475..f9af2d47d 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -191,15 +191,25 @@ public class IndexControlURLs_p { } if (post.containsKey("lurlexport")) { - boolean rss = post.get("format", "text").equals("rss"); + // parse format + int format = 0; + String fname = post.get("format", "url-text"); + boolean dom = fname.startsWith("dom"); // if dom == false complete urls are exported, otherwise only the domain + if (fname.endsWith("text")) format = 0; + if (fname.endsWith("html")) format = 1; + if (fname.endsWith("rss")) format = 2; + + // extend export file name String s = post.get("exportfile", ""); if (s.indexOf('.') < 0) { - if (rss) s = s + ".xml"; else s = s + ".txt"; + if (format == 0) s = s + ".txt"; + if (format == 1) s = s + ".html"; + if (format == 2) s = s + ".xml"; } File f = new File(s); f.getParentFile().mkdirs(); String filter = post.get("exportfilter", ".*"); - boolean running = sb.wordIndex.loadedURL.export(f, filter, rss); + boolean running = sb.wordIndex.loadedURL.export(f, filter, format, dom); prop.put("lurlexport_exportfile", s); prop.put("lurlexport_urlcount", sb.wordIndex.loadedURL.export_count()); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index dd403ef9b..0c18a19b7 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -267,7 +267,7 @@ public class yacysearch { "", 20, constraint, - false); + true); serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults()); String client = (String) header.get("CLIENTIP"); // the search client who initiated the search diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 6938e5cfc..cbc0f8cd1 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -66,12 +66,14 @@ import de.anomic.http.httpc; &#13;
import de.anomic.http.httpc.response; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURLEntry; +import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCloneableIterator; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroRowSet; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; import de.anomic.server.logging.serverLog; @@ -534,12 +536,12 @@ public final class plasmaCrawlLURL { private exportc exportthread = null; - public boolean export(File f, String filter, boolean rss) { + public boolean export(File f, String filter, int format, boolean dom) { if ((exportthread != null) && (exportthread.isAlive())) { serverLog.logWarning("LURL-EXPORT", "cannot start another export thread, already one running"); return false; } - this.exportthread = new exportc(f, filter, rss); + this.exportthread = new exportc(f, filter, format, dom); this.exportthread.start(); return (this.exportthread.isAlive()); } @@ -569,21 +571,30 @@ public final class plasmaCrawlLURL { String filter; int count; String failure; - boolean rss; + int format; + boolean dom; + kelondroRowSet doms; - public exportc(File f, String filter, boolean rss) { + public exportc(File f, String filter, int format, boolean dom) { + // format: 0=text, 1=html, 2=rss/xml this.f = f; this.filter = filter; this.count = 0; this.failure = null; - this.rss = rss; + this.format = format; + this.dom = dom; + if ((dom) && (format == 2)) dom = false; + this.doms = new kelondroRowSet(new kelondroRow("String hash-6", kelondroBase64Order.enhancedCoder, 0), 0); } public void run() { try { f.getParentFile().mkdirs(); PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f))); - if (rss) { + if (format == 1) { + pw.println(""); + } + 
if (format == 2) { pw.println(""); pw.println(""); pw.println(""); @@ -597,26 +608,45 @@ public final class plasmaCrawlLURL { indexURLEntry entry; indexURLEntry.Components comp; String url; - while (i.hasNext()) { + loop: while (i.hasNext()) { entry = (indexURLEntry) i.next(); comp = entry.comp(); url = comp.url().toNormalform(true, false); if (!url.matches(filter)) continue; - if (rss) { - pw.println(""); - pw.println("" + yacyURL.escape(comp.title()) + ""); - pw.println("" + url + ""); - if (comp.author().length() > 0) pw.println("" + comp.author() + ""); - if (comp.tags().length() > 0) pw.println("" + comp.tags() + ""); - pw.println("" + entry.moddate().toString() + ""); - pw.println("" + entry.hash() + ""); - pw.println(""); + if (dom) { + if (doms.has(entry.hash().substring(6).getBytes())) continue loop; + doms.add(entry.hash().substring(6).getBytes()); + url = comp.url().getHost(); + if (format == 0) { + pw.println(url); + } + if (format == 1) { + pw.println("" + url + "
"); + } } else { - pw.println(url); + if (format == 0) { + pw.println(url); + } + if (format == 1) { + pw.println("" + comp.title() + "
"); + } + if (format == 2) { + pw.println(""); + pw.println("" + comp.title() + ""); + pw.println("" + yacyURL.escape(url) + ""); + if (comp.author().length() > 0) pw.println("" + comp.author() + ""); + if (comp.tags().length() > 0) pw.println("" + comp.tags() + ""); + pw.println("" + entry.moddate().toString() + ""); + pw.println("" + entry.hash() + ""); + pw.println(""); + } } - count++; + count++; } - if (rss) { + if (format == 1) { + pw.println(""); + } + if (format == 2) { pw.println(""); pw.println("
"); } diff --git a/source/yacy.java b/source/yacy.java index 5198d32d8..c693e5706 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -75,10 +75,7 @@ import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMapObjects; import de.anomic.plasma.plasmaCondenser; -import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlLURL; -import de.anomic.plasma.plasmaCrawlNURL; -import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverCore; @@ -766,129 +763,6 @@ public final class yacy { serverLog.logInfo("TRANSFER-CR", "could not read file " + crfile); } } - /** - * Generates a text file containing all domains in this peer's DB. - * This may be useful to calculate the YaCy-Blockrank. - * - * @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain" - * @see urllist - */ - private static void domlist(String homePath, String source, String format, String targetName) { - - File root = new File(homePath); - try { - final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf", false); - HashMap doms = new HashMap(); - System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries."); - System.out.println("a dump will be written after double-check of all extracted domains."); - System.out.println("This process may fail in case of too less memory. 
To increase memory, start with"); - System.out.println("java -Xmxm -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ ]"); - int c = 0; - long start = System.currentTimeMillis(); - if (source.equals("lurl")) { - Iterator eiter = sb.wordIndex.loadedURL.entries(true, null); - indexURLEntry entry; - while (eiter.hasNext()) { - try { - entry = (indexURLEntry) eiter.next(); - indexURLEntry.Components comp = entry.comp(); - if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null); - } catch (Exception e) { - // here a MalformedURLException may occur - // just ignore - } - c++; - if (c % 10000 == 0) System.out.println( - c + " urls checked, " + - doms.size() + " domains collected, " + - ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + - ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining."); - } - } - if (source.equals("eurl")) { - Iterator eiter = sb.crawlQueues.errorURL.entries(true, null); - plasmaCrawlZURL.Entry entry; - while (eiter.hasNext()) { - try { - entry = (plasmaCrawlZURL.Entry) eiter.next(); - if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause()); - } catch (Exception e) { - // here a MalformedURLException may occur - // just ignore - } - c++; - if (c % 10000 == 0) System.out.println( - c + " urls checked, " + - doms.size() + " domains collected, " + - ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + - ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining."); - } - } - if (source.equals("nurl")) { - Iterator eiter = sb.crawlQueues.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); - plasmaCrawlEntry entry; - while 
(eiter.hasNext()) { - try { - entry = (plasmaCrawlEntry) eiter.next(); - if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth()); - } catch (Exception e) { - // here a MalformedURLException may occur - // just ignore - } - c++; - if (c % 10000 == 0) System.out.println( - c + " urls checked, " + - doms.size() + " domains collected, " + - ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + - ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining."); - } - } - - if (format.equals("html")) { - // output file in HTML format - File file = new File(root, targetName + ".html"); - BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); - System.out.println("Started domain list dump to file " + file); - Iterator i = doms.entrySet().iterator(); - Map.Entry entry; - String key; - bos.write(("").getBytes()); - bos.write(serverCore.crlf); - bos.write(("YaCy " + source + " domainlist").getBytes()); - bos.write(serverCore.crlf); - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - key = (String) entry.getKey(); - bos.write(("" + key + "" + - ((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "
" - ).getBytes()); - bos.write(serverCore.crlf); - } - bos.write(("").getBytes()); - bos.close(); - - } else if (format.equals("zip")) { - // output file in plain text but compressed with ZIP - File file = new File(root, targetName + ".zip"); - System.out.println("Started domain list dump to file " + file); - serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf)); - - } else if (format.equals("gzip")) { - // output file in plain text but compressed with GZIP - File file = new File(root, targetName + ".txt.gz"); - System.out.println("Started domain list dump to file " + file); - serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf)); - } else { - // plain text list - File file = new File(root, targetName + ".txt"); - System.out.println("Started domain list dump to file " + file); - serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf)); - } - sb.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } private static String[] shift(String[] args, int pos, int count) { String[] newargs = new String[args.length - count]; @@ -1082,27 +956,6 @@ public final class yacy { String targetaddress = args[1]; String crfile = args[2]; transferCR(targetaddress, crfile); - } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) { - // generate a url list and save it in a file - String source = "lurl"; - if (args.length >= 3 && args[1].toLowerCase().equals("-source")) { - if ((args[2].equals("nurl")) || - (args[2].equals("lurl")) || - (args[2].equals("eurl"))) - source = args[2]; - args = shift(args, 1, 2); - } - String format = "txt"; - if (args.length >= 3 && args[1].toLowerCase().equals("-format")) { - if ((args[2].equals("html")) || - (args[2].equals("zip")) || - (args[2].equals("gzip"))) - format = args[2]; - args = shift(args, 1, 2); - } - if (args.length == 2) applicationRoot= args[1]; - String outfile = "domlist_" + source + "_" + System.currentTimeMillis(); - 
domlist(applicationRoot, source, format, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { // generate a url list and save it in a file if (args.length == 2) applicationRoot= args[1];