@@ -57,6 +57,7 @@ import de.anomic.kelondro.text.IndexCollection;
 import de.anomic.kelondro.text.MetadataRepository;
 import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.ReferenceRow;
+import de.anomic.kelondro.text.MetadataRepository.Export;
 import de.anomic.kelondro.util.MemoryControl;
 import de.anomic.yacy.yacyURL;
@@ -410,13 +411,10 @@ public class URLAnalysis {
         MetadataRepository mr = new MetadataRepository(new File(metadataPath));
         HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 100);
         System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
-        byte[] refhash;
-        Iterator<byte[]> i = mr.iterator();
         long start = System.currentTimeMillis();
         long update = start - 7000;
         int c = 0;
-        while (i.hasNext()) {
-            refhash = i.next();
+        for (byte[] refhash : mr) {
             if (idx.get(refhash) == -1) {
                 // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
                 hs.put(refhash);
@@ -434,6 +432,33 @@ public class URLAnalysis {
         return c;
     }
 
+    public static void export(String metadataPath, int format, String export, String diffFile) throws IOException {
+        // format: 0=text, 1=html, 2=rss/xml
+        System.out.println("URL EXPORT startup");
+        MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+        HandleSet hs = (diffFile == null) ? null : new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile));
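+        // if a diff dump is given, only the URL hashes listed there are exported; otherwise the whole repository is written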
System . out . println ( "URL EXPORT loaded dump, starting export" ) ;
Export e = mr . export ( new File ( export ) , ".*" , hs , format , false ) ;
try {
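+            // the export runs in a background thread; wait here until it has finished writing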
+            e.join();
+        } catch (InterruptedException e1) {
+            e1.printStackTrace();
+        }
+        System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
+    }
+
+    public static void delete(String metadataPath, String diffFile) throws IOException {
+        System.out.println("URL DELETE startup");
+        MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+        int mrSize = mr.size();
+        HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile));
+        System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
+        for (byte[] refhash : hs) {
+            mr.remove(new String(refhash));
+        }
+        System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
+    }
+
     public static void main(String[] args) {
         if (args[0].equals("-stat") && args.length >= 2) {
             // generate a statistics about common words in file, store to <file>.stat
@@ -460,13 +485,58 @@ public class URLAnalysis {
             } catch (IOException e) {
                 e.printStackTrace();
             }
} else if ( args [ 0 ] . equals ( "-export" ) & & args . length > = 4 ) {
// export a url-list file
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
int format = ( args [ 2 ] . equals ( "xml" ) ) ? 2 : ( args [ 2 ] . equals ( "html" ) ) ? 1 : 0 ;
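+            // 0=text, 1=html, 2=rss/xml, matching the format codes expected by export()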
+            try {
+                export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
} else if ( args [ 0 ] . equals ( "-delete" ) & & args . length > = 3 ) {
// delete from URLs as given by urlreference diff dump
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
try {
delete ( args [ 1 ] , args [ 2 ] ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
         } else {
             System.out.println("usage:");
System . out . println ( "-stat <file> generate a statistics about common words in file, store to <file>.stat" ) ;
System . out . println ( ) ;
System . out . println ( "-host <file> generate a file <file>.host containing only the hosts of the urls" ) ;
System . out . println ( "-stat <file> " ) ;
System . out . println ( "-sort <file> generate file <file>.x.sort with sorted lists and split the file in smaller pieces" ) ;
System . out . println ( " generate a statistics about common words in file, store to <file>.stat" ) ;
System . out . println ( "-incollection <path-to-RICOLLECTION> <file> generate a dump of all referenced URL hashes" ) ;
System . out . println ( ) ;
System . out . println ( "-diffurlcol <path-to-URL-DB> <dump-from-incollection> <diff-dump> find URLs that occur " ) ;
System . out . println ( "-host <file>" ) ;
System . out . println ( " generate a file <file>.host containing only the hosts of the urls" ) ;
System . out . println ( ) ;
System . out . println ( "-sort <file>" ) ;
System . out . println ( " generate file <file>.x.sort with sorted lists and split the file in smaller pieces" ) ;
System . out . println ( ) ;
System . out . println ( "-incollection <path-to-RICOLLECTION> <file>" ) ;
System . out . println ( " generate a dump of all referenced URL hashes" ) ;
System . out . println ( ) ;
System . out . println ( "-diffurlcol <path-to-URL-DB> <dump-from-incollection> <diff-dump>" ) ;
System . out . println ( " find URLs that occur in url-db but not in collections" ) ;
System . out . println ( ) ;
System . out . println ( "-export <path-to-URL-DB> <format text|html|xml> <export-file> <diff-dump>" ) ;
System . out . println ( " export urls to file. the last argument can be omitted, then all urls are exported" ) ;
System . out . println ( ) ;
System . out . println ( "-delete <path-to-URL-DB> <diff-dump>" ) ;
System . out . println ( " delete all urls that are listed in the diff-dump from the url-db" ) ;
System . out . println ( ) ;
System . out . println ( "to do a complete clean-up of the url database, start the following:" ) ;
System . out . println ( ) ;
System . out . println ( "java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump" ) ;
System . out . println ( "java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump" ) ;
System . out . println ( "java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump" ) ;
System . out . println ( "java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump" ) ;
System . out . println ( ) ;
         }
     }
 }