From 60078cf322161f6339950a2ad440f0f6aa625507 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 10 Mar 2009 13:38:40 +0000
Subject: [PATCH] added next tool for url analysis: check for references that
 occur in the URL-DB but not in the RICOLLECTIONS

To use this, you must run the -incollection command first (see SVN 5687);
it produces a used.dump file. That file can then be used for a URL-hash
comparison against the URLs in the URL-DB. To do that, execute

java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump

Different names for the dump files may be used, and more memory may be
assigned. As a result, you get the file diffurlcol.dump, which contains all
URL hashes that occur in the URL database but not in the collections. The
file has the format {hash-12}*, meaning that the 12-byte hashes are listed
without any separator. A next step could be to process this file and delete
all URLs with the listed hashes, or to export them before deletion.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5692 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
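The dump written by -diffurlcol is a flat sequence of fixed-length 12-byte
records ({hash-12}*), so it can be read back without any YaCy classes. The
following standalone sketch is not part of this patch; it only illustrates
the format, and the file name is the default from the example command above:

    import java.io.BufferedInputStream;
    import java.io.DataInputStream;
    import java.io.EOFException;
    import java.io.FileInputStream;
    import java.io.IOException;

    public class DiffDumpReader {
        public static void main(String[] args) throws IOException {
            final int hashLength = 12; // 12-byte URL hashes, no separator between records
            final DataInputStream in = new DataInputStream(
                    new BufferedInputStream(new FileInputStream("diffurlcol.dump"), 1024 * 1024));
            final byte[] hash = new byte[hashLength];
            int count = 0;
            try {
                while (true) {
                    in.readFully(hash); // throws EOFException at the end of the dump
                    System.out.println(new String(hash, "UTF-8"));
                    count++;
                }
            } catch (EOFException e) {
                // end of dump reached
            } finally {
                in.close();
            }
            System.out.println(count + " url hashes in dump");
        }
    }

readFully reads complete 12-byte records only, so a truncated trailing
record is never counted as a hash.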
 source/de/anomic/data/URLAnalysis.java        |  63 ++++++-
 .../de/anomic/kelondro/index/HandleSet.java   | 158 ++++++++++++++++++
 .../kelondro/index/IntegerHandleIndex.java    |   4 +-
 .../kelondro/text/MetadataRepository.java     |  33 +++-
 source/de/anomic/plasma/plasmaWordIndex.java  |  29 ++--
 source/yacy.java                              |   6 +-
 6 files changed, 257 insertions(+), 36 deletions(-)
 create mode 100644 source/de/anomic/kelondro/index/HandleSet.java

diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java
index ee5697c61..dfa781255 100644
--- a/source/de/anomic/data/URLAnalysis.java
+++ b/source/de/anomic/data/URLAnalysis.java
@@ -50,9 +50,12 @@ import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 
+import de.anomic.kelondro.index.HandleSet;
 import de.anomic.kelondro.index.IntegerHandleIndex;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.text.IndexCollection;
+import de.anomic.kelondro.text.MetadataRepository;
+import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.ReferenceRow;
 import de.anomic.kelondro.util.MemoryControl;
 import de.anomic.yacy.yacyURL;
@@ -395,29 +398,75 @@ public class URLAnalysis {
                     ReferenceRow.urlEntryRow);
             System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics");
             idx.dump(new File(statisticPath));
-            System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump");
+            System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
     
+    public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
+        System.out.println("COLLECTION INDEX DIFF URL-COL startup");
+        IntegerHandleIndex idx = new IntegerHandleIndex(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(statisticFile));
+        MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+        HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 100);
+        System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
+        byte[] refhash;
+        Iterator<byte[]> i = mr.iterator();
+        long start = System.currentTimeMillis();
+        long update = start - 7000;
+        int c = 0;
+        while (i.hasNext()) {
+            refhash = i.next();
+            if (idx.get(refhash) == -1) {
+                // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
+                hs.put(refhash);
+            }
+            c++;
+            if (System.currentTimeMillis() - update > 10000) {
+                System.out.println("COLLECTION INDEX DIFF URL-COL running, checked " + c + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - c) / c) / 60000) + " minutes remaining");
+                update = System.currentTimeMillis();
+            }
+        }
+        mr.close();
+        System.out.println("COLLECTION INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
+        c = hs.dump(new File(diffFile));
+        System.out.println("COLLECTION INDEX DIFF URL-COL finished dump, wrote " + c + " references that occur in the URL-DB, but not in the collection-dump");
+        return c;
+    }
+    
     public static void main(String[] args) {
-        // example: java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
-        if (args[0].equals("-stat") && args.length >= 2) {
-            for (int i = 1; i < args.length; i++) genstat(args[i]);
+        if (args[0].equals("-stat") && args.length >= 2) {
+            // generate a statistic about common words in a file, store to <file>.stat
+            // example:
+            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
+            for (int i = 1; i < args.length; i++) genstat(args[i]);
         } else if (args[0].equals("-host") && args.length >= 2) {
+            // generate a file <file>.host containing only the hosts of the urls
            for (int i = 1; i < args.length; i++) genhost(args[i]);
         } else if (args[0].equals("-sort") && args.length >= 2) {
+            // generate a file <file>.x.sort with sorted lists and split the file into smaller pieces
            for (int i = 1; i < args.length; i++) sortsplit(args[i]);
         } else if (args[0].equals("-incollection") && args.length >= 2) {
+            // generate a dump of all referenced URL hashes from a given RICOLLECTION
             // example:
             // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump
             incollection(args[1], args[2]);
+        } else if (args[0].equals("-diffurlcol") && args.length >= 4) {
+            // the guard must be >= 4 because args[3] is accessed below
+            // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
+            // example:
+            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump
+            try {
+                diffurlcol(args[1], args[2], args[3]);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
         } else {
             System.out.println("usage:");
-            System.out.println("-stat <file>  generate a statistics about common words in file, store to <file>.stat");
-            System.out.println("-host <file>  generate a file <file>.host containing only the hosts of the urls");
-            System.out.println("-sort <file>  generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
+            System.out.println("-stat <file>          generate a statistic about common words in a file, store to <file>.stat");
+            System.out.println("-host <file>          generate a file <file>.host containing only the hosts of the urls");
+            System.out.println("-sort <file>          generate a file <file>.x.sort with sorted lists and split the file into smaller pieces");
+            System.out.println("-incollection <path>  generate a dump of all referenced URL hashes");
+            System.out.println("-diffurlcol <path>    find URL hashes that occur in the URL-DB but not in the collection dump");
         }
     }
 }
diff --git a/source/de/anomic/kelondro/index/HandleSet.java b/source/de/anomic/kelondro/index/HandleSet.java
new file mode 100644
index 000000000..3bcf74676
--- /dev/null
+++ b/source/de/anomic/kelondro/index/HandleSet.java
@@ -0,0 +1,158 @@
+// HandleSet.java
+// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 10.03.2009 on http://www.anomic.de
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.kelondro.index;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Iterator;
+
+import de.anomic.kelondro.order.ByteOrder;
+import de.anomic.kelondro.order.CloneableIterator;
+
+public class HandleSet implements Iterable<byte[]> {
+
+    private final Row rowdef;
+    private ObjectIndex index;
+
+    public HandleSet(final int keylength, final ByteOrder objectOrder, final int space) {
+        this.rowdef = new Row(new Column[]{new Column("key", Column.celltype_binary, Column.encoder_bytes, keylength, "key")}, objectOrder, 0);
+        this.index = new ObjectIndexCache(rowdef, space);
+    }
+
+    /**
+     * initialize a HandleSet with the content of a dump
+     * @param keylength
+     * @param objectOrder
+     * @param file
+     * @throws IOException
+     */
+    public HandleSet(final int keylength, final ByteOrder objectOrder, final File file) throws IOException {
+        this(keylength, objectOrder, (int) (file.length() / keylength)); // dump entries are keylength bytes each
+        // read the index dump and fill the index
+        InputStream is = new BufferedInputStream(new FileInputStream(file), 1024 * 1024);
+        byte[] a = new byte[keylength];
+        int c;
+        while (true) {
+            c = is.read(a);
+            if (c <= 0) break;
+            this.index.addUnique(this.rowdef.newEntry(a));
+        }
+        is.close();
+        assert this.index.size() == file.length() / keylength;
+    }
+
+    /**
+     * write a dump of the set to a file. All entries are written in order
+     * which makes it possible to read them again in a fast way
+     * @param file
+     * @return the number of written entries
+     * @throws IOException
+     */
+    public int dump(File file) throws IOException {
+        // we must use an iterator from the combined index, because we need the entries sorted
+        // otherwise we could just write the byte[] from the in-memory kelondroRowSet, which would make
+        // everything much faster, but this is not an option here.
+        Iterator<Row.Entry> i = this.index.rows(true, null);
+        OutputStream os = new BufferedOutputStream(new FileOutputStream(file), 1024 * 1024);
+        int c = 0;
+        while (i.hasNext()) {
+            os.write(i.next().bytes());
+            c++;
+        }
+        os.flush();
+        os.close();
+        return c;
+    }
+
+    public Row row() {
+        return index.row();
+    }
+
+    public void clear() throws IOException {
+        this.index.clear();
+    }
+
+    public synchronized boolean has(final byte[] key) {
+        assert (key != null);
+        return index.has(key);
+    }
+
+    public synchronized int put(final byte[] key) throws IOException {
+        assert (key != null);
+        final Row.Entry newentry = index.row().newEntry();
+        newentry.setCol(0, key);
+        final Row.Entry oldentry = index.put(newentry);
+        // the row consists of the key column only, so there is no payload to return;
+        // report -1 if the key was new and 0 if it was already present
+        return (oldentry == null) ? -1 : 0;
+    }
+
+    public synchronized void putUnique(final byte[] key) throws IOException {
+        assert (key != null);
+        final Row.Entry newentry = this.rowdef.newEntry();
+        newentry.setCol(0, key);
+        index.addUnique(newentry);
+    }
+
+    public synchronized int remove(final byte[] key) throws IOException {
+        assert (key != null);
+        final Row.Entry indexentry = index.remove(key);
+        // report -1 if the key was not present and 0 if it was removed
+        return (indexentry == null) ? -1 : 0;
+    }
+
+    public synchronized int removeone() throws IOException {
+        final Row.Entry indexentry = index.removeOne();
+        return (indexentry == null) ? -1 : 0;
+    }
+
+    public synchronized int size() {
+        return index.size();
+    }
+
+    public synchronized CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
+        try {
+            return index.keys(up, firstKey);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    public Iterator<byte[]> iterator() {
+        return keys(true, null);
+    }
+
+    public synchronized void close() {
+        index.close();
+        index = null;
+    }
+}
diff --git a/source/de/anomic/kelondro/index/IntegerHandleIndex.java b/source/de/anomic/kelondro/index/IntegerHandleIndex.java
index f02e1db47..76c576f2d 100644
--- a/source/de/anomic/kelondro/index/IntegerHandleIndex.java
+++ b/source/de/anomic/kelondro/index/IntegerHandleIndex.java
@@ -66,7 +66,7 @@ public class IntegerHandleIndex {
         this(keylength, objectOrder, (int) (file.length() / (keylength + 8)));
         // read the index dump and fill the index
         InputStream is = new BufferedInputStream(new FileInputStream(file), 1024 * 1024);
-        byte[] a = new byte[keylength + 8];
+        byte[] a = new byte[keylength + 4];
         int c;
         while (true) {
             c = is.read(a);
@@ -74,7 +74,7 @@ public class IntegerHandleIndex {
             this.index.addUnique(this.rowdef.newEntry(a));
         }
         is.close();
-        assert this.index.size() == file.length() / (keylength + 8);
+        assert this.index.size() == file.length() / (keylength + 4);
     }
diff --git a/source/de/anomic/kelondro/text/MetadataRepository.java b/source/de/anomic/kelondro/text/MetadataRepository.java
index 6a80c0674..cf2c26c9f 100644
--- a/source/de/anomic/kelondro/text/MetadataRepository.java
+++ b/source/de/anomic/kelondro/text/MetadataRepository.java
@@ -52,18 +52,20 @@ import de.anomic.kelondro.util.ScoreCluster;
 import de.anomic.kelondro.util.Log;
 import de.anomic.yacy.yacyURL;
 
-public final class MetadataRepository {
+public final class MetadataRepository implements Iterable<byte[]> {
 
     // class objects
-    ObjectIndex urlIndexFile;
-    private Export exportthread = null; // will have an export thread assigned if exporter is running
-    private File location = null;
-    ArrayList statsDump = null;
+    private ObjectIndex urlIndexFile;
+    private Export exportthread; // will have an export thread assigned if exporter is running
+    private File location;
+    private ArrayList statsDump;
 
-    public MetadataRepository(final File indexSecondaryPath) {
-        super();
-        this.location = new File(indexSecondaryPath, "TEXT");
-        urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false));
+    public MetadataRepository(final File path) {
+        this.location = path;
+        this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false));
+        this.exportthread = null; // will have an export thread assigned if exporter is running
+        this.statsDump = null;
+    }
 
     public void clearCache() {
@@ -151,6 +153,19 @@ public final class MetadataRepository {
         return urlIndexFile.has(urlHash.getBytes());
     }
 
+    public CloneableIterator<byte[]> keys(boolean up, byte[] firstKey) {
+        try {
+            return this.urlIndexFile.keys(up, firstKey);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    public Iterator<byte[]> iterator() {
+        return keys(true, null);
+    }
+
     public CloneableIterator entries() throws IOException {
         // enumerates entry elements
         return new kiter();
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 7ffb01918..f7f08ac2d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -45,7 +45,6 @@ import de.anomic.kelondro.index.RowCollection;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.order.ByteOrder;
 import de.anomic.kelondro.order.CloneableIterator;
-import de.anomic.kelondro.order.MergeIterator;
 import de.anomic.kelondro.order.Order;
 import de.anomic.kelondro.order.RotateIterator;
 import de.anomic.kelondro.text.Index;
@@ -96,19 +95,19 @@ public final class plasmaWordIndex implements Index {
     public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
 
-    private final IndexCache indexCache;
-    private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
-    private final Log log;
-    public MetadataRepository referenceURL;
-    public final yacySeedDB seedDB;
-    private final File primaryRoot, secondaryRoot;
-    public IndexingStack queuePreStack;
-    public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
-    public CrawlProfile.entry defaultProxyProfile;
-    public CrawlProfile.entry defaultRemoteProfile;
-    public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
-    public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
-    private final File queuesRoot;
+    private final IndexCache          indexCache;
+    private final IndexCollection     collections;   // new database structure to replace AssortmentCluster and FileCluster
+    private final Log                 log;
+    public MetadataRepository         referenceURL;
+    public final yacySeedDB           seedDB;
+    private final File                primaryRoot, secondaryRoot;
+    public IndexingStack              queuePreStack;
+    public CrawlProfile               profilesActiveCrawls, profilesPassiveCrawls;
+    public CrawlProfile.entry         defaultProxyProfile;
+    public CrawlProfile.entry         defaultRemoteProfile;
+    public CrawlProfile.entry         defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
+    public CrawlProfile.entry         defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
+    private final File                queuesRoot;
 
     public plasmaWordIndex(
             final String networkName,
@@ -171,7 +170,7 @@ public final class plasmaWordIndex implements Index {
                 useCommons);
 
         // create LURL-db
-        referenceURL = new MetadataRepository(this.secondaryRoot);
+        referenceURL = new MetadataRepository(new File(this.secondaryRoot, "TEXT"));
 
         // make crawl profiles database and default profiles
         this.queuesRoot = new File(this.primaryRoot, "QUEUES");
diff --git a/source/yacy.java b/source/yacy.java
index 6dc8aeaa9..6c327f4b2 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -667,10 +667,10 @@ public final class yacy {
         log.logInfo("STARTING URL CLEANUP");
 
         // db containing all currently loaded urls
-        final MetadataRepository currentUrlDB = new MetadataRepository(new File(indexSecondaryRoot, networkName));
+        final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexSecondaryRoot, networkName), "TEXT"));
 
         // db used to hold all needed urls
-        final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(indexRoot2, networkName));
+        final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
 
         final int cacheMem = (int)(MemoryControl.max() - MemoryControl.total());
         if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
@@ -849,7 +849,7 @@ public final class yacy {
         final File root = homePath;
         final File indexroot = new File(root, "DATA/INDEX");
         try {Log.configureLogging(homePath, new File(homePath, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
-        final MetadataRepository currentUrlDB = new MetadataRepository(new File(indexroot, networkName));
+        final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
         currentUrlDB.deadlinkCleaner(null);
         currentUrlDB.close();
     }
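As suggested in the commit message, a follow-up step could process
diffurlcol.dump and delete or export the listed URLs. The sketch below is
not part of this patch: it reads the dump back through the new HandleSet
file constructor, reusing the key length and byte order that diffurlcol
used for writing, and leaves the actual deletion out because no deletion
API is touched by this change:

    import java.io.File;
    import java.io.IOException;

    import de.anomic.kelondro.index.HandleSet;
    import de.anomic.kelondro.text.MetadataRowContainer;

    public class DiffDumpCheck {
        public static void main(String[] args) throws IOException {
            final HandleSet missing = new HandleSet(
                    MetadataRowContainer.rowdef.primaryKeyLength, // 12-byte URL hashes
                    MetadataRowContainer.rowdef.objectOrder,      // same order the dump was written with
                    new File("diffurlcol.dump"));
            System.out.println(missing.size() + " url hashes occur in the URL-DB but not in the collections");
            for (final byte[] hash : missing) {
                // a cleanup pass would delete or export the URL entry for this hash here
                System.out.println(new String(hash, "UTF-8"));
            }
            missing.close();
        }
    }

Loading the dump into a HandleSet instead of scanning it linearly also
allows fast has() membership tests, for example to filter an export of the
URL database before deletion.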