diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java
index ee5697c61..dfa781255 100644
--- a/source/de/anomic/data/URLAnalysis.java
+++ b/source/de/anomic/data/URLAnalysis.java
@@ -50,9 +50,12 @@ import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 
+import de.anomic.kelondro.index.HandleSet;
 import de.anomic.kelondro.index.IntegerHandleIndex;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.text.IndexCollection;
+import de.anomic.kelondro.text.MetadataRepository;
+import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.ReferenceRow;
 import de.anomic.kelondro.util.MemoryControl;
 import de.anomic.yacy.yacyURL;
@@ -395,29 +398,75 @@ public class URLAnalysis {
                     ReferenceRow.urlEntryRow);
             System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics");
             idx.dump(new File(statisticPath));
-            System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump");
+            System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
 
+    public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
+        System.out.println("COLLECTION INDEX DIFF URL-COL startup");
+        IntegerHandleIndex idx = new IntegerHandleIndex(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(statisticFile));
+        MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+        HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 100);
+        System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
+        byte[] refhash;
+        Iterator<byte[]> i = mr.iterator();
+        long start = System.currentTimeMillis();
+        long update = start - 7000; // print the first progress message after 3 seconds
+        int c = 0;
+        while (i.hasNext()) {
+            refhash = i.next();
+            if (idx.get(refhash) == -1) {
+                // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
+                hs.put(refhash);
+            }
+            c++;
+            if (System.currentTimeMillis() - update > 10000) {
+                System.out.println("COLLECTION INDEX DIFF URL-COL running, checked " + c + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - c) / c) / 60000) + " minutes remaining");
+                update = System.currentTimeMillis();
+            }
+        }
+        mr.close();
+        System.out.println("COLLECTION INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
+        c = hs.dump(new File(diffFile));
+        System.out.println("COLLECTION INDEX DIFF URL-COL finished dump, wrote " + c + " references that occur in the URL-DB, but not in the collection-dump");
+        return c;
+    }
+
     public static void main(String[] args) {
-        // example: java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
-        if (args[0].equals("-stat") && args.length >= 2) {
-            for (int i = 1; i < args.length; i++) genstat(args[i]);
+        if (args[0].equals("-stat") && args.length >= 2) {
+            // generate statistics about common words in a file, store to <file>.stat
+            // example:
+            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
+            for (int i = 1; i < args.length; i++) genstat(args[i]);
         } else if (args[0].equals("-host") && args.length >= 2) {
+            // generate a file <file>.host containing only the hosts of the urls
             for (int i = 1; i < args.length; i++) genhost(args[i]);
         } else if (args[0].equals("-sort") && args.length >= 2) {
+            // generate file <file>.x.sort with sorted lists and split the file in smaller pieces
             for (int i = 1; i < args.length; i++) sortsplit(args[i]);
         } else if (args[0].equals("-incollection") && args.length >= 2) {
+            // generate a dump of all referenced URL hashes from a given RICOLLECTION
             // example:
             // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump
             incollection(args[1], args[2]);
+        } else if (args[0].equals("-diffurlcol") && args.length >= 3) {
+            // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
+            // example:
+            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump
+            try {
+                diffurlcol(args[1], args[2], args[3]);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
         } else {
             System.out.println("usage:");
-            System.out.println("-stat  generate a statistics about common words in file, store to <file>.stat");
-            System.out.println("-host  generate a file <file>.host containing only the hosts of the urls");
-            System.out.println("-sort  generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
+            System.out.println("-stat          generate statistics about common words in a file, store to <file>.stat");
+            System.out.println("-host          generate a file <file>.host containing only the hosts of the urls");
+            System.out.println("-sort          generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
+            System.out.println("-incollection  generate a dump of all referenced URL hashes from a RICOLLECTION");
+            System.out.println("-diffurlcol    find URL hashes that occur in the URL-DB but not in a collection reference dump");
         }
     }
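Taken together, the two new modes form a pipeline: -incollection dumps every URL hash that is referenced in the RICOLLECTION, and -diffurlcol compares that dump against the URL metadata database. A possible session, reusing the example paths from the comments above:

    java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump
    java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump

The resulting diffurlcol.dump holds the hashes that exist in the URL-DB but are not referenced by any collection entry, i.e. candidates for a later cleanup step.
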
diff --git a/source/de/anomic/kelondro/index/HandleSet.java b/source/de/anomic/kelondro/index/HandleSet.java
new file mode 100644
index 000000000..3bcf74676
--- /dev/null
+++ b/source/de/anomic/kelondro/index/HandleSet.java
@@ -0,0 +1,158 @@
+// HandleSet.java
+// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 10.03.2009 on http://www.anomic.de
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.kelondro.index;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Iterator;
+
+import de.anomic.kelondro.order.ByteOrder;
+import de.anomic.kelondro.order.CloneableIterator;
+
+public class HandleSet implements Iterable<byte[]> {
+
+    private final Row rowdef;
+    private ObjectIndex index;
+
+    public HandleSet(final int keylength, final ByteOrder objectOrder, final int space) {
+        this.rowdef = new Row(new Column[]{new Column("key", Column.celltype_binary, Column.encoder_bytes, keylength, "key")}, objectOrder, 0);
+        this.index = new ObjectIndexCache(rowdef, space);
+    }
+
+    /**
+     * initialize a HandleSet with the content of a dump
+     * @param keylength
+     * @param objectOrder
+     * @param file
+     * @throws IOException
+     */
+    public HandleSet(final int keylength, final ByteOrder objectOrder, final File file) throws IOException {
+        this(keylength, objectOrder, (int) (file.length() / keylength)); // each dump record carries only the key
+        // read the index dump and fill the index
+        InputStream is = new BufferedInputStream(new FileInputStream(file), 1024 * 1024);
+        byte[] a = new byte[keylength];
+        int c;
+        while (true) {
+            c = is.read(a);
+            if (c <= 0) break;
+            this.index.addUnique(this.rowdef.newEntry(a));
+        }
+        is.close();
+        assert this.index.size() == file.length() / keylength;
+    }
+
+    /**
+     * write a dump of the set to a file. All entries are written in order
+     * which makes it possible to read them again in a fast way
+     * @param file
+     * @return the number of written entries
+     * @throws IOException
+     */
+    public int dump(File file) throws IOException {
+        // we must use an iterator from the combined index, because we need the entries sorted
+        // otherwise we could just write the byte[] from the in-memory kelondroRowSet which would make
+        // everything much faster, but this is not an option here.
+        Iterator<Row.Entry> i = this.index.rows(true, null);
+        OutputStream os = new BufferedOutputStream(new FileOutputStream(file), 1024 * 1024);
+        int c = 0;
+        while (i.hasNext()) {
+            os.write(i.next().bytes());
+            c++;
+        }
+        os.flush();
+        os.close();
+        return c;
+    }
+
+    public Row row() {
+        return index.row();
+    }
+
+    public void clear() throws IOException {
+        this.index.clear();
+    }
+
+    public synchronized boolean has(final byte[] key) {
+        assert (key != null);
+        return index.has(key);
+    }
+
+    public synchronized boolean put(final byte[] key) throws IOException {
+        assert (key != null);
+        final Row.Entry newentry = index.row().newEntry();
+        newentry.setCol(0, key);
+        final Row.Entry oldentry = index.put(newentry);
+        return oldentry == null; // true if the key was not in the set before
+    }
+
+    public synchronized void putUnique(final byte[] key) throws IOException {
+        assert (key != null);
+        final Row.Entry newentry = this.rowdef.newEntry();
+        newentry.setCol(0, key);
+        index.addUnique(newentry);
+    }
+
+    public synchronized boolean remove(final byte[] key) throws IOException {
+        assert (key != null);
+        final Row.Entry indexentry = index.remove(key);
+        return indexentry != null; // true if the key was present
+    }
+
+    public synchronized byte[] removeone() throws IOException {
+        final Row.Entry indexentry = index.removeOne();
+        if (indexentry == null) return null;
+        return indexentry.getColBytes(0); // return the removed key
+    }
+
+    public synchronized int size() {
+        return index.size();
+    }
+
+    public synchronized CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
+        try {
+            return index.keys(up, firstKey);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    public Iterator<byte[]> iterator() {
+        return keys(true, null);
+    }
+
+    public synchronized void close() {
+        index.close();
+        index = null;
+    }
+}
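As an illustration of the new class, a minimal round-trip might look as follows (a sketch, not part of the patch; the 12-byte key length, the file name and the use of Base64Order.enhancedCoder are assumptions chosen to match the key format used elsewhere in this patch):

    // fill a HandleSet, dump it sorted, and re-load the dump
    HandleSet set = new HandleSet(12, Base64Order.enhancedCoder, 100);
    set.putUnique("AAAAAAAAAAAA".getBytes());
    set.putUnique("BBBBBBBBBBBB".getBytes());
    int written = set.dump(new File("handles.dump")); // entries are written in key order
    set.close();
    HandleSet reloaded = new HandleSet(12, Base64Order.enhancedCoder, new File("handles.dump"));
    assert reloaded.size() == written;

Because dump() writes the keys in sorted order, the file constructor can rebuild the set with plain sequential addUnique() calls, which is what keeps the re-load fast.
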
diff --git a/source/de/anomic/kelondro/index/IntegerHandleIndex.java b/source/de/anomic/kelondro/index/IntegerHandleIndex.java
index f02e1db47..76c576f2d 100644
--- a/source/de/anomic/kelondro/index/IntegerHandleIndex.java
+++ b/source/de/anomic/kelondro/index/IntegerHandleIndex.java
@@ -66,7 +66,7 @@ public class IntegerHandleIndex {
         this(keylength, objectOrder, (int) (file.length() / (keylength + 8)));
         // read the index dump and fill the index
         InputStream is = new BufferedInputStream(new FileInputStream(file), 1024 * 1024);
-        byte[] a = new byte[keylength + 8];
+        byte[] a = new byte[keylength + 4];
         int c;
         while (true) {
             c = is.read(a);
@@ -74,7 +74,7 @@ public class IntegerHandleIndex {
             this.index.addUnique(this.rowdef.newEntry(a));
         }
         is.close();
-        assert this.index.size() == file.length() / (keylength + 8);
+        assert this.index.size() == file.length() / (keylength + 4);
     }
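The corrected buffer size matches the on-disk record layout of an IntegerHandleIndex dump: each record is the key followed by a 4-byte int handle, not an 8-byte long. For example, with a 12-byte key (an assumed value, for illustration only):

    int keylength = 12;
    int recordsize = keylength + 4;                        // 16 bytes per dumped entry
    long records = new File("used.dump").length() / recordsize;

The space estimate in the unchanged context line above still divides by (keylength + 8); that merely undersizes the initial index, which then grows as entries are added.
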
diff --git a/source/de/anomic/kelondro/text/MetadataRepository.java b/source/de/anomic/kelondro/text/MetadataRepository.java
index 6a80c0674..cf2c26c9f 100644
--- a/source/de/anomic/kelondro/text/MetadataRepository.java
+++ b/source/de/anomic/kelondro/text/MetadataRepository.java
@@ -52,18 +52,20 @@ import de.anomic.kelondro.util.ScoreCluster;
 import de.anomic.kelondro.util.Log;
 import de.anomic.yacy.yacyURL;
 
-public final class MetadataRepository {
+public final class MetadataRepository implements Iterable<byte[]> {
 
     // class objects
-    ObjectIndex urlIndexFile;
-    private Export exportthread = null; // will have a export thread assigned if exporter is running
-    private File location = null;
-    ArrayList<hostStat> statsDump = null;
+    private ObjectIndex urlIndexFile;
+    private Export exportthread; // will have an export thread assigned if exporter is running
+    private File location;
+    private ArrayList<hostStat> statsDump;
 
-    public MetadataRepository(final File indexSecondaryPath) {
-        super();
-        this.location = new File(indexSecondaryPath, "TEXT");
-        urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false));
+    public MetadataRepository(final File path) {
+        this.location = path;
+        this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false));
+        this.exportthread = null; // will have an export thread assigned if exporter is running
+        this.statsDump = null;
     }
 
     public void clearCache() {
@@ -151,6 +153,19 @@ public final class MetadataRepository {
         return urlIndexFile.has(urlHash.getBytes());
     }
 
+    public CloneableIterator<byte[]> keys(boolean up, byte[] firstKey) {
+        try {
+            return this.urlIndexFile.keys(up, firstKey);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    public Iterator<byte[]> iterator() {
+        return keys(true, null);
+    }
+
     public CloneableIterator<MetadataRowContainer> entries() throws IOException {
         // enumerates entry elements
         return new kiter();
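Implementing Iterable<byte[]> is what allows diffurlcol() above to walk the whole URL database with a plain iterator. A usage sketch (the path is an example):

    MetadataRepository repository = new MetadataRepository(new File("DATA/INDEX/freeworld/TEXT"));
    for (byte[] urlhash : repository) {
        // urlhash is the key of one URL entry, delivered in ascending key order
    }
    repository.close();
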
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 7ffb01918..f7f08ac2d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -45,7 +45,6 @@ import de.anomic.kelondro.index.RowCollection;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.order.ByteOrder;
 import de.anomic.kelondro.order.CloneableIterator;
-import de.anomic.kelondro.order.MergeIterator;
 import de.anomic.kelondro.order.Order;
 import de.anomic.kelondro.order.RotateIterator;
 import de.anomic.kelondro.text.Index;
@@ -96,19 +95,19 @@ public final class plasmaWordIndex implements Index {
     public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
 
-    private final IndexCache indexCache;
-    private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
-    private final Log log;
-    public MetadataRepository referenceURL;
-    public final yacySeedDB seedDB;
-    private final File primaryRoot, secondaryRoot;
-    public IndexingStack queuePreStack;
-    public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
-    public CrawlProfile.entry defaultProxyProfile;
-    public CrawlProfile.entry defaultRemoteProfile;
-    public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
-    public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
-    private final File queuesRoot;
+    private final IndexCache       indexCache;
+    private final IndexCollection  collections; // new database structure to replace AssortmentCluster and FileCluster
+    private final Log              log;
+    public MetadataRepository      referenceURL;
+    public final yacySeedDB        seedDB;
+    private final File             primaryRoot, secondaryRoot;
+    public IndexingStack           queuePreStack;
+    public CrawlProfile            profilesActiveCrawls, profilesPassiveCrawls;
+    public CrawlProfile.entry      defaultProxyProfile;
+    public CrawlProfile.entry      defaultRemoteProfile;
+    public CrawlProfile.entry      defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
+    public CrawlProfile.entry      defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
+    private final File             queuesRoot;
 
     public plasmaWordIndex(
             final String networkName,
@@ -171,7 +170,7 @@ public final class plasmaWordIndex implements Index {
                 useCommons);
 
         // create LURL-db
-        referenceURL = new MetadataRepository(this.secondaryRoot);
+        referenceURL = new MetadataRepository(new File(this.secondaryRoot, "TEXT"));
 
         // make crawl profiles database and default profiles
         this.queuesRoot = new File(this.primaryRoot, "QUEUES");
diff --git a/source/yacy.java b/source/yacy.java
index 6dc8aeaa9..6c327f4b2 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -667,10 +667,10 @@ public final class yacy {
         log.logInfo("STARTING URL CLEANUP");
 
         // db containing all currently loades urls
-        final MetadataRepository currentUrlDB = new MetadataRepository(new File(indexSecondaryRoot, networkName));
+        final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexSecondaryRoot, networkName), "TEXT"));
 
         // db used to hold all neede urls
-        final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(indexRoot2, networkName));
+        final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
 
         final int cacheMem = (int)(MemoryControl.max() - MemoryControl.total());
         if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
@@ -849,7 +849,7 @@ public final class yacy {
         final File root = homePath;
         final File indexroot = new File(root, "DATA/INDEX");
         try {Log.configureLogging(homePath, new File(homePath, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
-        final MetadataRepository currentUrlDB = new MetadataRepository(new File(indexroot, networkName));
+        final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
         currentUrlDB.deadlinkCleaner(null);
         currentUrlDB.close();
     }
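The constructor change explains the wrapped File constructions in plasmaWordIndex and yacy: MetadataRepository no longer appends the "TEXT" segment itself, so every caller now builds the complete index path, following this pattern:

    // before: new MetadataRepository(new File(indexSecondaryRoot, networkName)) -- "TEXT" was appended internally
    // after:  the caller supplies the full index directory
    MetadataRepository db = new MetadataRepository(new File(new File(indexSecondaryRoot, networkName), "TEXT"));

This is also what makes the standalone -diffurlcol mode possible: URLAnalysis can point the repository at any TEXT directory directly, such as DATA/INDEX/freeworld/TEXT.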