diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java
index dfa781255..f67f02d1e 100644
--- a/source/de/anomic/data/URLAnalysis.java
+++ b/source/de/anomic/data/URLAnalysis.java
@@ -57,6 +57,7 @@ import de.anomic.kelondro.text.IndexCollection;
 import de.anomic.kelondro.text.MetadataRepository;
 import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.ReferenceRow;
+import de.anomic.kelondro.text.MetadataRepository.Export;
 import de.anomic.kelondro.util.MemoryControl;
 
 import de.anomic.yacy.yacyURL;
@@ -410,13 +411,10 @@ public class URLAnalysis {
         MetadataRepository mr = new MetadataRepository(new File(metadataPath));
         HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 100);
         System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
-        byte[] refhash;
-        Iterator<byte[]> i = mr.iterator();
         long start = System.currentTimeMillis();
         long update = start - 7000;
         int c = 0;
-        while (i.hasNext()) {
-            refhash = i.next();
+        for (byte[] refhash: mr) {
             if (idx.get(refhash) == -1) {
                 // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
                 hs.put(refhash);
@@ -434,6 +432,33 @@ public class URLAnalysis {
         return c;
     }
 
+    public static void export(String metadataPath, int format, String export, String diffFile) throws IOException {
+        // format: 0=text, 1=html, 2=rss/xml
+        System.out.println("URL EXPORT startup");
+        MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+        HandleSet hs = (diffFile == null) ? null : new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile));
+        System.out.println("URL EXPORT loaded dump, starting export");
+        Export e = mr.export(new File(export), ".*", hs, format, false);
+        try {
+            e.join();
+        } catch (InterruptedException e1) {
+            e1.printStackTrace();
+        }
+        System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
+    }
+
+    public static void delete(String metadataPath, String diffFile) throws IOException {
+        System.out.println("URL DELETE startup");
+        MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+        int mrSize = mr.size();
+        HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile));
+        System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
+        for (byte[] refhash: hs) {
+            mr.remove(new String(refhash));
+        }
+        System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
+    }
+
     public static void main(String[] args) {
         if (args[0].equals("-stat") && args.length >= 2) {
             // generate a statistics about common words in file, store to <file>.stat
@@ -460,13 +485,58 @@ public class URLAnalysis {
             } catch (IOException e) {
                 e.printStackTrace();
             }
+        } else if (args[0].equals("-export") && args.length >= 4) {
+            // export a url-list file
+            // example:
+            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
+            // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
+            int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
+            try {
+                export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        } else if (args[0].equals("-delete") && args.length >= 3) {
+            // delete from URLs as given by urlreference diff dump
+            // example:
+            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump
+            // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
+            try {
+                delete(args[1], args[2]);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
         } else {
             System.out.println("usage:");
-            System.out.println("-stat <file> generate a statistics about common words in file, store to <file>.stat");
-            System.out.println("-host <file> generate a file <file>.host containing only the hosts of the urls");
-            System.out.println("-sort <file> generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
-            System.out.println("-incollection <path-to-RICOLLECTION> <dump-file> generate a dump of all referenced URL hashes");
-            System.out.println("-diffurlcol <path-to-URL-DB> <dump-file> <diff-file> find URLs that occur ");
+            System.out.println();
+            System.out.println("-stat <file>");
+            System.out.println("  generate a statistics about common words in file, store to <file>.stat");
+            System.out.println();
+            System.out.println("-host <file>");
+            System.out.println("  generate a file <file>.host containing only the hosts of the urls");
+            System.out.println();
+            System.out.println("-sort <file>");
+            System.out.println("  generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
+            System.out.println();
+            System.out.println("-incollection <path-to-RICOLLECTION> <dump-file>");
+            System.out.println("  generate a dump of all referenced URL hashes");
+            System.out.println();
+            System.out.println("-diffurlcol <path-to-URL-DB> <dump-file> <diff-file>");
+            System.out.println("  find URLs that occur in url-db but not in collections");
+            System.out.println();
+            System.out.println("-export <path-to-URL-DB> <format text|html|xml> <export-file> <diff-file>");
+            System.out.println("  export urls to file. the last argument can be omitted, then all urls are exported");
+            System.out.println();
+            System.out.println("-delete <path-to-URL-DB> <diff-file>");
+            System.out.println("  delete all urls that are listed in the diff-dump from the url-db");
+            System.out.println();
+            System.out.println("to do a complete clean-up of the url database, start the following:");
+            System.out.println();
+            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump");
+            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump");
+            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump");
+            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump");
+            System.out.println();
         }
     }
 
diff --git a/source/de/anomic/kelondro/text/MetadataRepository.java b/source/de/anomic/kelondro/text/MetadataRepository.java
index cf2c26c9f..75b22c19a 100644
--- a/source/de/anomic/kelondro/text/MetadataRepository.java
+++ b/source/de/anomic/kelondro/text/MetadataRepository.java
@@ -44,6 +44,7 @@ import de.anomic.http.httpClient;
 import de.anomic.http.httpResponse;
 import de.anomic.http.httpRemoteProxyConfig;
 import de.anomic.kelondro.blob.Cache;
+import de.anomic.kelondro.index.HandleSet;
 import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.index.ObjectIndex;
 import de.anomic.kelondro.order.CloneableIterator;
@@ -406,12 +407,12 @@ public final class MetadataRepository implements Iterable<byte[]> {
     }
 
     // export methods
-    public Export export(final File f, final String filter, final int format, final boolean dom) {
+    public Export export(final File f, final String filter, HandleSet set, final int format, final boolean dom) {
         if ((exportthread != null) && (exportthread.isAlive())) {
             Log.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
             return exportthread;
         }
-        this.exportthread = new Export(f, filter, format, dom);
+        this.exportthread = new Export(f, filter, set, format, dom);
         this.exportthread.start();
         return exportthread;
     }
@@ -427,8 +428,9 @@ public final class MetadataRepository implements Iterable<byte[]> {
         private String failure;
         private final int format;
         private final boolean dom;
+        private HandleSet set;
 
-        public Export(final File f, final String filter, final int format, boolean dom) {
+        public Export(final File f, final String filter, HandleSet set, final int format, boolean dom) {
             // format: 0=text, 1=html, 2=rss/xml
             this.f = f;
             this.filter = filter;
@@ -436,12 +438,14 @@ public final class MetadataRepository implements Iterable<byte[]> {
             this.failure = null;
             this.format = format;
             this.dom = dom;
+            this.set = set;
             if ((dom) && (format == 2)) dom = false;
         }
 
         public void run() {
             try {
-                f.getParentFile().mkdirs();
+                File parentf = f.getParentFile();
+                if (parentf != null) parentf.mkdirs();
                 final PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f)));
                 if (format == 1) {
                     pw.println("<html>");
@@ -471,6 +475,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
                 String url;
                 while (i.hasNext()) {
                     entry = i.next();
+                    if (this.set != null && !set.has(entry.hash().getBytes())) continue;
                     metadata = entry.metadata();
                     url = metadata.url().toNormalform(true, false);
                     if (!url.matches(filter)) continue;
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index f7f08ac2d..8f35f214f 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -397,7 +397,7 @@ public final class plasmaWordIndex implements Index {
     }
 
     public Export exportURL(final File f, final String filter, final int format, final boolean dom) {
-        return this.referenceURL.export(f, filter, format, dom);
+        return this.referenceURL.export(f, filter, null, format, dom);
     }
 
     public Export exportURL() {
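
Usage sketch (illustrative only, not part of the patch above): the export/delete helpers added to URLAnalysis can also be driven from Java code instead of the command line. This assumes the same DATA/INDEX/freeworld/TEXT layout and diffurlcol.dump file named in the usage examples, and that the dump was produced by the -incollection/-diffurlcol steps first; UrlCleanupSketch is a hypothetical class written only for this illustration.

    import java.io.IOException;

    import de.anomic.data.URLAnalysis;

    public class UrlCleanupSketch {
        public static void main(String[] args) throws IOException {
            final String metadataPath = "DATA/INDEX/freeworld/TEXT"; // assumed URL-DB location
            final String diffDump = "diffurlcol.dump";               // produced by the -diffurlcol step
            // export only the URL hashes listed in the diff dump, as rss/xml (format 2)
            URLAnalysis.export(metadataPath, 2, "urls.xml", diffDump);
            // then remove exactly those URLs from the URL database
            URLAnalysis.delete(metadataPath, diffDump);
        }
    }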