added domain list extraction and HTML export format

to the URL administration menu at http://localhost:8080/IndexControlURLs_p.html

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4228 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent d8d77fc4b2
commit 445c0b5333
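For orientation before the diff: a minimal, hypothetical sketch of the export-selector logic this commit introduces. The radio value from IndexControlURLs_p.html (e.g. "dom-html" or "url-rss") is split into a domain-only flag and a format code (0=text, 1=html, 2=rss/xml) that choose the default file extension and the per-entry output. Class and method names below are made up for illustration; the committed code lives in IndexControlURLs_p.java and plasmaCrawlLURL.java.

// Illustrative sketch only, not the committed code.
public class ExportFormatSketch {

    static final int FORMAT_TEXT = 0; // plain text list
    static final int FORMAT_HTML = 1; // HTML list with anchors
    static final int FORMAT_RSS  = 2; // RSS/XML feed

    // fname is the radio value: "dom-text", "dom-html", "url-text", "url-html" or "url-rss"
    static int parseFormat(String fname) {
        if (fname.endsWith("html")) return FORMAT_HTML;
        if (fname.endsWith("rss"))  return FORMAT_RSS;
        return FORMAT_TEXT;
    }

    static boolean domainOnly(String fname) {
        // "dom-*" exports one line per domain; RSS always carries full URLs, so dom is dropped there
        return fname.startsWith("dom") && parseFormat(fname) != FORMAT_RSS;
    }

    static String defaultExtension(int format) {
        if (format == FORMAT_HTML) return ".html";
        if (format == FORMAT_RSS)  return ".xml";
        return ".txt";
    }

    public static void main(String[] args) {
        String fname = "dom-html";
        System.out.println(parseFormat(fname));                    // 1
        System.out.println(domainOnly(fname));                     // true
        System.out.println(defaultExtension(parseFormat(fname)));  // .html
    }
}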

@@ -17,7 +17,7 @@
<input type="submit" name="urlstringsearch" value="Show Details for URL" />
</dd>
<dt class="TableCellDark">Rertieve by URL-Hash:</dt>
<dt class="TableCellDark">Retrieve by URL-Hash:</dt>
<dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
<input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" />
<input type="submit" name="urlhashsimilar" value="Generate List" />
@@ -51,8 +51,14 @@
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd><input type="radio" name="format" value="rss" checked />XML (RSS)&nbsp;&nbsp;
<input type="radio" name="format" value="text" />Plain Text List (URLs only)
<dd>Only Domain:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
<input type="radio" name="format" value="dom-html" />HTML (domains as URLs, no title)<br>
Full URL List:
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-rss" checked />XML (RSS)
</br>
</dd>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" />

@@ -191,15 +191,25 @@ public class IndexControlURLs_p {
}
if (post.containsKey("lurlexport")) {
boolean rss = post.get("format", "text").equals("rss");
// parse format
int format = 0;
String fname = post.get("format", "url-text");
boolean dom = fname.startsWith("dom"); // if dom == false complete urls are exported, otherwise only the domain
if (fname.endsWith("text")) format = 0;
if (fname.endsWith("html")) format = 1;
if (fname.endsWith("rss")) format = 2;
// extend export file name
String s = post.get("exportfile", "");
if (s.indexOf('.') < 0) {
if (rss) s = s + ".xml"; else s = s + ".txt";
if (format == 0) s = s + ".txt";
if (format == 1) s = s + ".html";
if (format == 2) s = s + ".xml";
}
File f = new File(s);
f.getParentFile().mkdirs();
String filter = post.get("exportfilter", ".*");
boolean running = sb.wordIndex.loadedURL.export(f, filter, rss);
boolean running = sb.wordIndex.loadedURL.export(f, filter, format, dom);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", sb.wordIndex.loadedURL.export_count());

@@ -267,7 +267,7 @@ public class yacysearch {
"",
20,
constraint,
false);
true);
serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults());
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search

@@ -66,12 +66,14 @@ import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.logging.serverLog;
@@ -534,12 +536,12 @@ public final class plasmaCrawlLURL {
private exportc exportthread = null;
public boolean export(File f, String filter, boolean rss) {
public boolean export(File f, String filter, int format, boolean dom) {
if ((exportthread != null) && (exportthread.isAlive())) {
serverLog.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
return false;
}
this.exportthread = new exportc(f, filter, rss);
this.exportthread = new exportc(f, filter, format, dom);
this.exportthread.start();
return (this.exportthread.isAlive());
}
@@ -569,21 +571,30 @@ public final class plasmaCrawlLURL {
String filter;
int count;
String failure;
boolean rss;
int format;
boolean dom;
kelondroRowSet doms;
public exportc(File f, String filter, boolean rss) {
public exportc(File f, String filter, int format, boolean dom) {
// format: 0=text, 1=html, 2=rss/xml
this.f = f;
this.filter = filter;
this.count = 0;
this.failure = null;
this.rss = rss;
this.format = format;
this.dom = dom;
if ((dom) && (format == 2)) dom = false;
this.doms = new kelondroRowSet(new kelondroRow("String hash-6", kelondroBase64Order.enhancedCoder, 0), 0);
}
public void run() {
try {
f.getParentFile().mkdirs();
PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f)));
if (rss) {
if (format == 1) {
pw.println("<html><head></head><body>");
}
if (format == 2) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\">");
@@ -597,26 +608,45 @@ public final class plasmaCrawlLURL {
indexURLEntry entry;
indexURLEntry.Components comp;
String url;
while (i.hasNext()) {
loop: while (i.hasNext()) {
entry = (indexURLEntry) i.next();
comp = entry.comp();
url = comp.url().toNormalform(true, false);
if (!url.matches(filter)) continue;
if (rss) {
pw.println("<item>");
pw.println("<title>" + yacyURL.escape(comp.title()) + "</title>");
pw.println("<link>" + url + "</link>");
if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
pw.println("</item>");
if (dom) {
if (doms.has(entry.hash().substring(6).getBytes())) continue loop;
doms.add(entry.hash().substring(6).getBytes());
url = comp.url().getHost();
if (format == 0) {
pw.println(url);
}
if (format == 1) {
pw.println("<a href=\"http://" + url + "\">" + url + "</a><br>");
}
} else {
pw.println(url);
if (format == 0) {
pw.println(url);
}
if (format == 1) {
pw.println("<a href=\"" + url + "\">" + comp.title() + "</a><br>");
}
if (format == 2) {
pw.println("<item>");
pw.println("<title>" + comp.title() + "</title>");
pw.println("<link>" + yacyURL.escape(url) + "</link>");
if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
pw.println("</item>");
}
}
count++;
count++;
}
if (rss) {
if (format == 1) {
pw.println("</body></html>");
}
if (format == 2) {
pw.println("</channel>");
pw.println("</rss>");
}
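
The domain-only modes skip hosts that were already written by remembering the domain part of each URL hash (entry.hash().substring(6), kept in a kelondroRowSet in the committed code). A simplified sketch of the same de-duplication idea with a plain java.util.HashSet and made-up 12-character hashes:

import java.util.HashSet;
import java.util.Set;

// Sketch of the domain de-duplication used by the new "Only Domain" export modes.
public class DomainDedupSketch {
    public static void main(String[] args) {
        // hypothetical 12-character URL hashes; the trailing 6 characters identify the host
        String[] urlHashes = { "aaaaaaHOSTAA", "bbbbbbHOSTAA", "ccccccHOSTBB" };
        Set<String> seenHosts = new HashSet<String>();
        for (String hash : urlHashes) {
            String hostKey = hash.substring(6);
            if (!seenHosts.add(hostKey)) continue; // host already exported, skip this URL
            System.out.println("export entry for host key " + hostKey);
        }
    }
}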

@@ -75,10 +75,7 @@ import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMapObjects;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverCore;
@@ -766,129 +763,6 @@ public final class yacy {
serverLog.logInfo("TRANSFER-CR", "could not read file " + crfile);
}
}
/**
* Generates a text file containing all domains in this peer's DB.
* This may be useful to calculate the YaCy-Blockrank.
*
* @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain"
* @see urllist
*/
private static void domlist(String homePath, String source, String format, String targetName) {
File root = new File(homePath);
try {
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf", false);
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
int c = 0;
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = sb.wordIndex.loadedURL.entries(true, null);
indexURLEntry entry;
while (eiter.hasNext()) {
try {
entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("eurl")) {
Iterator eiter = sb.crawlQueues.errorURL.entries(true, null);
plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("nurl")) {
Iterator eiter = sb.crawlQueues.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (format.equals("html")) {
// output file in HTML format
File file = new File(root, targetName + ".html");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.entrySet().iterator();
Map.Entry entry;
String key;
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
bos.write(serverCore.crlf);
bos.write(("<html><head><title>YaCy " + source + " domainlist</title></head><body>").getBytes());
bos.write(serverCore.crlf);
while (i.hasNext()) {
entry = (Map.Entry) i.next();
key = (String) entry.getKey();
bos.write(("<a href=\"http://" + key + "\">" + key + "</a>" +
((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "<br>"
).getBytes());
bos.write(serverCore.crlf);
}
bos.write(("</body></html>").getBytes());
bos.close();
} else if (format.equals("zip")) {
// output file in plain text but compressed with ZIP
File file = new File(root, targetName + ".zip");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf));
} else if (format.equals("gzip")) {
// output file in plain text but compressed with GZIP
File file = new File(root, targetName + ".txt.gz");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf));
} else {
// plain text list
File file = new File(root, targetName + ".txt");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
}
sb.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count];
@@ -1082,27 +956,6 @@ public final class yacy {
String targetaddress = args[1];
String crfile = args[2];
transferCR(targetaddress, crfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("nurl")) ||
(args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
String format = "txt";
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if ((args[2].equals("html")) ||
(args[2].equals("zip")) ||
(args[2].equals("gzip")))
format = args[2];
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
domlist(applicationRoot, source, format, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];
