diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java deleted file mode 100644 index c30ea0e7c..000000000 --- a/htroot/IndexCleaner_p.java +++ /dev/null @@ -1,108 +0,0 @@ -//----------------------- -//part of the AnomicHTTPD caching proxy -//(C) by Michael Peter Christen; mc@yacy.net -//first published on http://www.anomic.de -//Frankfurt, Germany, 2005 -// -//This file is contributed by Matthias Soehnholz -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. -// -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. -// -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.search.Switchboard; -import net.yacy.search.index.MetadataRepository; -import net.yacy.search.index.Segment; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class IndexCleaner_p { - private static MetadataRepository.BlacklistCleaner urldbCleanerThread = null; - private static Segment.ReferenceCleaner indexCleanerThread = null; - - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { - final serverObjects prop = new serverObjects(); - final Switchboard sb = (Switchboard) env; - prop.put("title", "DbCleanup_p"); - - // get segment - Segment indexSegment = sb.index; - - if (post!=null) { - if (post.get("action").equals("ustart")) { - if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) { - urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker); - urldbCleanerThread.start(); - } - else { - urldbCleanerThread.endPause(); - } - } - else if (post.get("action").equals("ustop") && (urldbCleanerThread!=null)) { - urldbCleanerThread.abort(); - } - else if (post.get("action").equals("upause") && (urldbCleanerThread!=null)) { - urldbCleanerThread.pause(); - } - else if (post.get("action").equals("rstart")) { - if (indexCleanerThread==null || !indexCleanerThread.isAlive()) { - indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes()); - indexCleanerThread.start(); - } - else { - indexCleanerThread.endPause(); - } - } - else if (post.get("action").equals("rstop") && (indexCleanerThread!=null)) { - indexCleanerThread.abort(); - } - else if (post.get("action").equals("rpause") && (indexCleanerThread!=null)) { - indexCleanerThread.pause(); - } - prop.put("LOCATION",""); - return prop; - } - if (urldbCleanerThread!=null) { - prop.put("urldb", "1"); - prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100); - prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls); - prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls); - prop.putHTML("urldb_lastBlacklistedUrl", 
urldbCleanerThread.lastBlacklistedUrl); - prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash); - prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl); - prop.put("urldb_lastHash", urldbCleanerThread.lastHash); - prop.put("urldb_threadAlive", Boolean.toString(urldbCleanerThread.isAlive())); - prop.put("urldb_threadToString", urldbCleanerThread.toString()); - final double percent = ((double)urldbCleanerThread.blacklistedUrls/urldbCleanerThread.totalSearchedUrls)*100; - prop.putNum("urldb_percent", percent); - } - if (indexCleanerThread!=null) { - prop.put("rwidb", "1"); - prop.put("rwidb_threadAlive", Boolean.toString(indexCleanerThread.isAlive())); - prop.put("rwidb_threadToString", indexCleanerThread.toString()); - prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart); - prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize()); - prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : ASCII.String(indexCleanerThread.wordHashNow)); - prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : ASCII.String(indexCleanerThread.lastWordHash)); - prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter); - - } - return prop; - } -} diff --git a/htroot/IndexControlCleaner_p.html b/htroot/IndexControlCleaner_p.html deleted file mode 100644 index 7ef61758d..000000000 --- a/htroot/IndexControlCleaner_p.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - YaCy '#[clientname]#': Index Cleaner - #%env/templates/metas.template%# - - -
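The IndexCleaner_p servlet deleted above steers both cleaner threads through the same start/pause/resume/abort protocol. That protocol reduces to a guarded wait/notifyAll loop; the following stand-alone sketch (a hypothetical class, not part of the YaCy sources) shows the pattern that the deleted BlacklistCleaner and ReferenceCleaner both implement:

    // Hypothetical illustration of the pause/abort protocol driven by the servlet above.
    public abstract class PausableCleaner extends Thread {
        private boolean run = true;     // cleared by abort()
        private boolean pause = false;  // toggled by pause()/endPause()

        @Override
        public void run() {
            while (this.run) {
                synchronized (this) {
                    if (this.pause) {
                        try { this.wait(); }  // woken by endPause() or abort()
                        catch (final InterruptedException e) { this.run = false; return; }
                    }
                }
                if (!this.run) break;
                cleanOneEntry();  // one unit of work per loop pass
            }
        }

        protected abstract void cleanOneEntry();

        public synchronized void abort()    { this.run = false; notifyAll(); }
        public synchronized void pause()    { this.pause = true; }
        public synchronized void endPause() { this.pause = false; notifyAll(); }
    }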
- #(inline)##%env/templates/header.template%# - -

Steering of API Actions

-

This table shows search results that have been sorted out from the search result display because their content could not be verified. - This means that the searched word does not appear on the search page. -

::#(/inline)# - #(showtable)#:: -
-
- -

- #(navigation)# - :: - #(left)#no previous page::previous page#(/left)# - #[startRecord]#-#[to]# of #[of]# - #(right)#no next page::next page#(/right)# - - #(/navigation)# -

- - - - - -
-

-

- - - - - - - - - - - #(inline)#::#(/inline)# - - #{list}# - - - - - - - - - - #(inline)#::#(/inline)# - - #{/list}# - -
Type
Comment
Call Count
Recording Date
Last Exec Date
Next Exec Date
Scheduler
URL
#[type]#
#[comment]#
#[callcount]#
#[dateRecording]#
#[dateLastExec]#
#[dateNextExec]#
- #(scheduler)# - - - - - - :: -
-
- - - -
- - - - -
- #(/scheduler)# -
#[url]#
-

-

- - - -

- - - #(/showtable)# -
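The deleted page above is written in YaCy's servlet template markup, which the servlet side fills in: #[name]# expands to a value set with prop.put("name", ...), #(name)#A::B#(/name)# picks branch A or B according to a numeric switch value, and #{name}#...#{/name}# repeats its body once per list row. A minimal sketch of the servlet side, using property names taken from the deleted IndexCleaner_p.java above:

    // Fills #[urldb_total]# and selects branch 1 of a #(urldb)#...::...#(/urldb)# switch.
    prop.put("urldb", "1");                                          // numeric switch: show the status block
    prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
    prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl);       // HTML-escaped on output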
-    #%env/templates/footer.template%#
-
-
-
\ No newline at end of file
diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html
index 2fc254a01..9793e0d4d 100644
--- a/htroot/PerformanceQueues_p.html
+++ b/htroot/PerformanceQueues_p.html
@@ -72,14 +72,6 @@
       RAM Cache
       Description
-
-      URLs in RAM buffer:
-      #[urlCacheSize]#
-
-      This is the size of the URL write buffer. Its purpose is to buffer incoming URLs
-      in case of search result transmission and during DHT transfer.
-
-
       Words in RAM cache: (Size in KBytes)
       #[wordCacheSize]# (#[wordCacheSizeKBytes]# KB)
diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java
index de414a3a4..d9f50d0c3 100644
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@@ -299,7 +299,6 @@ public class PerformanceQueues_p {
         prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
 
         // table cache settings
-        prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize());
         prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
         prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
         prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
diff --git a/htroot/PerformanceQueues_p.xml b/htroot/PerformanceQueues_p.xml
index 8535e76a9..e7fc14d95 100644
--- a/htroot/PerformanceQueues_p.xml
+++ b/htroot/PerformanceQueues_p.xml
@@ -24,7 +24,6 @@
   #{/table}#
-  #[urlCacheSize]#
   #[wordCacheSize]#
   #[maxURLinCache]#
   #[maxAgeOfCache]#
diff --git a/htroot/env/templates/submenuBlacklist.template b/htroot/env/templates/submenuBlacklist.template
index 6c157a9eb..a02bb6c1f 100644
--- a/htroot/env/templates/submenuBlacklist.template
+++ b/htroot/env/templates/submenuBlacklist.template
@@ -5,6 +5,5 @@
  • Blacklist Cleaner
  • Blacklist Test
  • Import/Export
  •
-  • Index Cleaner
  • \ No newline at end of file diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java deleted file mode 100644 index c2693ca52..000000000 --- a/source/de/anomic/data/URLAnalysis.java +++ /dev/null @@ -1,564 +0,0 @@ -// URLAnalysis.java -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 24.02.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -package de.anomic.data; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.net.MalformedURLException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import net.yacy.cora.document.UTF8; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.index.HandleMap; -import net.yacy.kelondro.index.HandleSet; -import net.yacy.kelondro.index.RowSpaceExceededException; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.order.Base64Order; -import net.yacy.kelondro.rwi.ReferenceContainerArray; -import net.yacy.kelondro.util.MemoryControl; -import net.yacy.search.index.MetadataRepository; -import net.yacy.search.index.Segment; -import net.yacy.search.index.MetadataRepository.Export; - -public class URLAnalysis { - - private static final Pattern patternMinus = Pattern.compile("-"); - - /** - * processes to analyse URL lists - */ - - private static DigestURI poison = null; - static { - try { - poison = new DigestURI("http://poison.org/poison"); - } catch (final MalformedURLException e) { - poison = null; - } - } - - public static class splitter extends Thread { - - private final ArrayBlockingQueue in; - private final ConcurrentHashMap out; - - public splitter(final ArrayBlockingQueue in, final ConcurrentHashMap out) { - this.in = in; - this.out = out; - } - - @Override - public void run() { - try { - DigestURI url; - final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_"); - while (true) { - try { - url = this.in.take(); - if (url == poison) break; - 
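                        // The check above implements a poison-pill shutdown: genstat() enqueues
                        // one sentinel URL per splitter thread, and each worker leaves its take()
                        // loop when the sentinel instance arrives. The idiom in isolation
                        // (illustrative names and simplified types, not part of the original file):
                        //
                        //     final ArrayBlockingQueue<DigestURI> queue = new ArrayBlockingQueue<DigestURI>(1000);
                        //     DigestURI item;
                        //     while ((item = queue.take()) != POISON) process(item);   // worker loop
                        //     for (int i = 0; i < nWorkers; i++) queue.put(POISON);    // shutdown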
update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\.")); - update(p.matcher(url.getPath()).replaceAll("/").split("/")); - } catch (final InterruptedException e) { - Log.logException(e); - } - } - } catch (final Exception e) { - Log.logException(e); - } - } - - private void update(final String[] s) { - Integer c; - for (final String t: s) { - if (t.isEmpty()) continue; - c = this.out.get(t); - this.out.put(t, (c == null) ? 1 : c.intValue() + 1); - } - } - } - - public static void cleanup(final ConcurrentHashMap stat) { - Map.Entry entry; - int c, low = Integer.MAX_VALUE; - Iterator> i = stat.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - c = entry.getValue().intValue(); - if (c == 1) { - i.remove(); - } else { - if (c < low) low = c; - } - } - i = stat.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - c = entry.getValue().intValue(); - if (c == low) { - i.remove(); - } - } - Runtime.getRuntime().gc(); - } - - public static void genstat(final String urlfile) { - - final boolean gz = urlfile.endsWith(".gz"); - final String analysis = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".stats.gz" : urlfile + ".stats"; - final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8); - - // start threads - final ArrayBlockingQueue in = new ArrayBlockingQueue(1000); - final ConcurrentHashMap out = new ConcurrentHashMap(); - for (int i = 0, available = Runtime.getRuntime().availableProcessors(); i < available; i++) new splitter(in, out).start(); - final splitter spl = new splitter(in, out); - spl.start(); - - // put urls in queue - final File infile = new File(urlfile); - final File outfile = new File(analysis); - BufferedReader reader = null; - long time = System.currentTimeMillis(); - final long start = time; - int count = 0; - - System.out.println("start processing"); - try { - InputStream is = new BufferedInputStream(new FileInputStream(infile)); - if (gz) is = new GZIPInputStream(is); - reader = new BufferedReader(new InputStreamReader(is)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.length() > 0) { - try { - final DigestURI url = new DigestURI(line); - in.put(url); - } catch (final InterruptedException e) { - Log.logException(e); - } catch (final MalformedURLException e) { - continue; - } - } - count++; - if (System.currentTimeMillis() - time > 1000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); - if (MemoryControl.available() < cleanuplimit) { - System.out.println("starting cleanup, " + out.size() + " entries in statistic"); - cleanup(out); - System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); - } - } - } - reader.close(); - } catch (final IOException e) { - Log.logException(e); - } finally { - if (reader != null) try { reader.close(); } catch (final Exception e) {} - } - - // stop threads - System.out.println("stopping threads"); - for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try { - in.put(poison); - } catch (final InterruptedException e) { - Log.logException(e); - } - try { - spl.join(); - } catch (final InterruptedException e1) { - Log.logException(e1); - } - - // generate statistics - System.out.println("start processing results"); - final TreeMap results = 
new TreeMap(); - count = 0; - Map.Entry entry; - final Iterator> i = out.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue()); - count++; - i.remove(); // free memory - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); - } - } - - // write statistics - System.out.println("start writing results"); - try { - OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile)); - if (gz) os = new GZIPOutputStream(os); - count = 0; - for (final Map.Entry e: results.entrySet()) { - os.write(UTF8.getBytes(e.getKey())); - os.write(new byte[]{'\t'}); - os.write(UTF8.getBytes(Integer.toString(e.getValue()))); - os.write(new byte[]{'\n'}); - count++; - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("wrote " + count + " lines."); - } - } - os.close(); - } catch (final IOException e) { - Log.logException(e); - } - - System.out.println("finished"); - } - - public static void genhost(final String urlfile) { - - final boolean gz = urlfile.endsWith(".gz"); - final String trunk = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".host" : urlfile + ".host"; - final HashSet hosts = new HashSet(); - final File infile = new File(urlfile); - BufferedReader reader = null; - long time = System.currentTimeMillis(); - final long start = time; - int count = 0; - - System.out.println("start processing"); - try { - InputStream is = new BufferedInputStream(new FileInputStream(infile)); - if (gz) is = new GZIPInputStream(is); - reader = new BufferedReader(new InputStreamReader(is)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.length() > 0) { - try { - final DigestURI url = new DigestURI(line); - hosts.add(url.getHost()); - } catch (final MalformedURLException e) { - continue; - } - } - count++; - if (System.currentTimeMillis() - time > 1000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); - } - } - reader.close(); - } catch (final IOException e) { - Log.logException(e); - } finally { - if (reader != null) try { reader.close(); } catch (final Exception e) {} - } - - // copy everything into a TreeSet to order it - System.out.println("start processing results"); - final TreeSet results = new TreeSet(); - count = 0; - final Iterator i = hosts.iterator(); - while (i.hasNext()) { - results.add(i.next()); - count++; - i.remove(); // free memory - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); - } - } - - // write hosts - writeSet(trunk, gz, results); - - System.out.println("finished"); - } - - private static void writeSet(final String trunk, final boolean gz, final Set set) { - - // write hosts - System.out.println("start writing results"); - final File outfile = new File(trunk + ((gz) ? 
".gz" : "")); - long time = System.currentTimeMillis(); - try { - OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile)); - if (gz) os = new GZIPOutputStream(os); - int count = 0; - for (final String h: set) { - os.write(UTF8.getBytes(h)); - os.write(new byte[]{'\n'}); - count++; - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("wrote " + count + " lines."); - } - } - os.close(); - } catch (final IOException e) { - Log.logException(e); - } - - System.out.println("finished writing results"); - } - - public static void sortsplit(final String urlfile) { - - final boolean gz = urlfile.endsWith(".gz"); - final String trunk = ((gz) ? urlfile.substring(0, urlfile.length() - 3) : urlfile) + ".sort"; - final File infile = new File(urlfile); - final TreeSet urls = new TreeSet(); - BufferedReader reader = null; - long time = System.currentTimeMillis(); - final long start = time; - int count = 0; - int filecount = 0; - final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8); - - System.out.println("start processing"); - try { - InputStream is = new BufferedInputStream(new FileInputStream(infile)); - if (gz) is = new GZIPInputStream(is); - reader = new BufferedReader(new InputStreamReader(is)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.length() > 0) { - try { - final DigestURI url = new DigestURI(line); - urls.add(url.toNormalform(true, true)); - } catch (final MalformedURLException e) { - continue; - } - } - count++; - if (System.currentTimeMillis() - time > 1000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); - } - if (MemoryControl.available() < cleanuplimit) { - writeSet(trunk + "." + filecount, gz, urls); - filecount++; - urls.clear(); - Runtime.getRuntime().gc(); - } - } - reader.close(); - } catch (final IOException e) { - Log.logException(e); - } finally { - if (reader != null) try { reader.close(); } catch (final Exception e) {} - } - - // write hosts - writeSet(trunk + "." 
+ filecount, gz, urls); - - System.out.println("finished"); - } - - public static void incell(final File cellPath, final String statisticPath) { - try { - final HandleMap idx = ReferenceContainerArray.referenceHashes( - cellPath, - Segment.wordReferenceFactory, - Base64Order.enhancedCoder, - WordReferenceRow.urlEntryRow); - System.out.println("INDEX REFERENCE COLLECTION starting dump of statistics"); - idx.dump(new File(statisticPath)); - System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath); - idx.close(); - } catch (final Exception e) { - Log.logException(e); - } - } - - public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException { - System.out.println("INDEX DIFF URL-COL startup"); - final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile)); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - mr.connectUrlDb(Segment.UrlDbName, false, false); - final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000); - System.out.println("INDEX DIFF URL-COL loaded dump, starting diff"); - final long start = System.currentTimeMillis(); - long update = start - 7000; - int count = 0; - for (final byte[] refhash: mr) { - if (idx.get(refhash) == -1) { - // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash - hs.put(refhash); - } - count++; - if (System.currentTimeMillis() - update > 10000) { - System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - count) / count) / 60000) + " minutes remaining"); - update = System.currentTimeMillis(); - } - } - idx.close(); - mr.close(); - System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile); - count = hs.dump(new File(diffFile)); - System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump"); - return count; - } - - public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException { - // format: 0=text, 1=html, 2=rss/xml - System.out.println("URL EXPORT startup"); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - mr.connectUrlDb(Segment.UrlDbName, false, false); - final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); - System.out.println("URL EXPORT loaded dump, starting export"); - final Export e = mr.export(new File(export), ".*", hs, format, false); - try { - e.join(); - } catch (final InterruptedException e1) { - Log.logException(e1); - } - System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? 
mr.size() : hs.size()) + " entries"); - } - - public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException { - System.out.println("URL DELETE startup"); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - mr.connectUrlDb(Segment.UrlDbName, false, false); - final int mrSize = mr.size(); - final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); - System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); - for (final byte[] refhash: hs) { - mr.remove(refhash); - } - System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database"); - } - - public static void main(final String[] args) { - if (args[0].equals("-stat") && args.length >= 2) { - // generate a statistics about common words in file, store to .stat - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz - for (int i = 1; i < args.length; i++) genstat(args[i]); - } else if (args[0].equals("-host") && args.length >= 2) { - // generate a file .host containing only the hosts of the urls - for (int i = 1; i < args.length; i++) genhost(args[i]); - } else if (args[0].equals("-sort") && args.length >= 2) { - // generate file .x.sort with sorted lists and split the file in smaller pieces - for (int i = 1; i < args.length; i++) sortsplit(args[i]); - } else if (args[0].equals("-incell") && args.length >= 2) { - // generate a dump of all referenced URL hashes from a given RICELL - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incell DATA/INDEX/freeworld/TEXT/RICELL used.dump - incell(new File(args[1]), args[2]); - } else if (args[0].equals("-diffurlcol") && args.length >= 3) { - // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump - try { - diffurlcol(args[1], args[2], args[3]); - } catch (final Exception e) { - Log.logException(e); - } - } else if (args[0].equals("-export") && args.length >= 4) { - // export a url-list file - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump - // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html' - final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0; - try { - export(args[1], format, args[3], (args.length >= 5) ? 
args[4] : null); - } catch (final Exception e) { - Log.logException(e); - } - } else if (args[0].equals("-delete") && args.length >= 3) { - // delete from URLs as given by urlreference diff dump - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump - // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html' - try { - delete(args[1], args[2]); - } catch (final Exception e) { - Log.logException(e); - } - } else { - System.out.println("usage:"); - System.out.println(); - System.out.println("-stat "); - System.out.println(" generate a statistics about common words in file, store to .stat"); - System.out.println(); - System.out.println("-host "); - System.out.println(" generate a file .host containing only the hosts of the urls"); - System.out.println(); - System.out.println("-sort "); - System.out.println(" generate file .x.sort with sorted lists and split the file in smaller pieces"); - System.out.println(); - System.out.println("-incollection "); - System.out.println(" generate a dump of all referenced URL hashes"); - System.out.println(); - System.out.println("-diffurlcol "); - System.out.println(" find URLs that occur in url-db but not in collections"); - System.out.println(); - System.out.println("-export "); - System.out.println(" export urls to file. the last argument can be omitted, then all urls are exported"); - System.out.println(); - System.out.println("-delete "); - System.out.println(" delete all urls that are listed in the diff-dump from the url-db"); - System.out.println(); - System.out.println("to do a complete clean-up of the url database, start the following:"); - System.out.println(); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump"); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump"); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump"); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump"); - System.out.println(); - } - System.exit(0); // kill remaining threads - } - - private static final String num(final int i) { - final StringBuilder s = new StringBuilder(Integer.toString(i)); - while (s.length() < 9) s.insert(0, "0"); - return s.toString(); - } -} diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index d872885a4..fd458f54c 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -33,21 +33,17 @@ import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.order.CloneableIterator; -import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.solr.DoubleSolrConnector; import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ScoreMap; -import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import 
net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadata;
@@ -61,15 +57,11 @@
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.table.SplitTable;
 import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.repository.Blacklist;
-import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
 import net.yacy.search.solr.EmbeddedSolrConnector;
 
 import org.apache.lucene.util.Version;
 
-import de.anomic.crawler.CrawlStacker;
-
 public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
 
     // class objects
@@ -186,26 +178,20 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
         this.solr.close();
     }
 
-    public int writeCacheSize() {
-        if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
-        if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
-        return 0;
-    }
-
     /**
      * generates a plasmaLURLEntry using the url hash
     * if the url cannot be found, this returns null
     * @param obrwi
     * @return
     */
-    public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
-        if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
-        final byte[] urlHash = obrwi.getElement().urlhash();
+    public URIMetadata load(WordReferenceVars wre, long weight) {
+        if (wre == null) return null; // all time was already wasted in takeRWI to get another element
+        final byte[] urlHash = wre.urlhash();
         if (urlHash == null) return null;
         if (this.urlIndexFile != null) try {
             final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
             if (entry == null) return null;
-            return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
+            return new URIMetadataRow(entry, wre, weight);
         } catch (final IOException e) {
             Log.logException(e);
         }
@@ -280,29 +266,25 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
 
     public boolean exists(final byte[] urlHash) {
         if (urlHash == null) return false;
+        if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true;
         try {
             if (this.solr.exists(ASCII.String(urlHash))) return true;
         } catch (final Throwable e) {
             Log.logException(e);
         }
-        if (this.urlIndexFile == null) return false; // case may happen during shutdown
-        return this.urlIndexFile.has(urlHash);
+        return false;
     }
 
-    public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
+    @Override
+    public Iterator<byte[]> iterator() {
         try {
-            return this.urlIndexFile.keys(up, firstKey);
+            return this.urlIndexFile.keys(true, null);
         } catch (final IOException e) {
             Log.logException(e);
             return null;
         }
     }
 
-    @Override
-    public Iterator<byte[]> iterator() {
-        return keys(true, null);
-    }
-
     public CloneableIterator<URIMetadata> entries() throws IOException {
         // enumerates entry elements
         return new kiter();
@@ -367,186 +349,6 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
         }
     }
 
-
-    /**
-     * Uses an Iteration over urlHash.db to detect malformed URL-Entries.
-     * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
- * - * @param proxyConfig - */ - public void deadlinkCleaner() { - final Log log = new Log("URLDBCLEANUP"); - final HashSet damagedURLS = new HashSet(); - try { - final Iterator eiter = entries(true, null); - int iteratorCount = 0; - while (eiter.hasNext()) try { - eiter.next(); - iteratorCount++; - } catch (final RuntimeException e) { - if(e.getMessage() != null) { - final String m = e.getMessage(); - damagedURLS.add(m.substring(m.length() - 12)); - } else { - log.logSevere("RuntimeException:", e); - } - } - log.logInfo("URLs vorher: " + this.urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size()); - - final HTTPClient client = new HTTPClient(); - final Iterator eiter2 = damagedURLS.iterator(); - byte[] urlHashBytes; - while (eiter2.hasNext()) { - urlHashBytes = ASCII.getBytes(eiter2.next()); - - // trying to fix the invalid URL - String oldUrlStr = null; - try { - // getting the url data as byte array - final Row.Entry entry = this.urlIndexFile.get(urlHashBytes, true); - - // getting the wrong url string - oldUrlStr = entry.getColUTF8(1).trim(); - - int pos = -1; - if ((pos = oldUrlStr.indexOf("://",0)) != -1) { - // trying to correct the url - final String newUrlStr = "http://" + oldUrlStr.substring(pos + 3); - final DigestURI newUrl = new DigestURI(newUrlStr); - - if (client.HEADResponse(newUrl.toString()) != null - && client.getHttpResponse().getStatusLine().getStatusCode() == 200) { - entry.setCol(1, UTF8.getBytes(newUrl.toString())); - this.urlIndexFile.put(entry); - if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr); - } else { - remove(urlHashBytes); - if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (client.getHttpResponse() == null ? 
"null" : client.getHttpResponse().getStatusLine())); - } - } - } catch (final Exception e) { - remove(urlHashBytes); - if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage()); - } - } - - log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size()); - } catch (final IOException e) { - log.logSevere("IOException", e); - } - } - - public BlacklistCleaner getBlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) { - return new BlacklistCleaner(blacklist, crawlStacker); - } - - public class BlacklistCleaner extends Thread { - - private boolean run = true; - private boolean pause; - public int blacklistedUrls = 0; - public int totalSearchedUrls = 1; - public String lastBlacklistedUrl = ""; - public String lastBlacklistedHash = ""; - public String lastUrl = ""; - public String lastHash = ""; - private final Blacklist blacklist; - private final CrawlStacker crawlStacker; - - public BlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) { - this.blacklist = blacklist; - this.crawlStacker = crawlStacker; - } - - @Override - public void run() { - try { - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); - final Iterator eiter = entries(true, null); - while (eiter.hasNext() && this.run) { - synchronized (this) { - if (this.pause) { - try { - this.wait(); - } catch (final InterruptedException e) { - Log.logWarning("URLDBCLEANER", "InterruptedException", e); - this.run = false; - return; - } - } - } - final URIMetadata entry = eiter.next(); - if (entry == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null"); - } else if (entry.hash() == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + "hash == null"); - } else { - this.totalSearchedUrls++; - if (entry.url() == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + "URL == null"); - remove(entry.hash()); - continue; - } - if (this.blacklist.isListed(BlacklistType.CRAWLER, entry) || - this.blacklist.isListed(BlacklistType.DHT, entry) || - (this.crawlStacker.urlInAcceptedDomain(entry.url()) != null)) { - this.lastBlacklistedUrl = entry.url().toNormalform(true, true); - this.lastBlacklistedHash = ASCII.String(entry.hash()); - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + " " + entry.url().toNormalform(false, true)); - remove(entry.hash()); - if (this.blacklistedUrls % 100 == 0) { - Log.logInfo("URLDBCLEANER", "Deleted " + this.blacklistedUrls + " URLs until now. 
Last deleted URL-Hash: " + this.lastBlacklistedUrl); - } - } - this.lastUrl = entry.url().toNormalform(true, true); - this.lastHash = ASCII.String(entry.hash()); - } - } - } catch (final RuntimeException e) { - if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL",0) != -1) { - Log.logWarning("URLDBCLEANER", "urlHash not found in LURL", e); - } - else { - Log.logWarning("URLDBCLEANER", "RuntimeException", e); - this.run = false; - } - } catch (final IOException e) { - Log.logException(e); - this.run = false; - } catch (final Exception e) { - Log.logException(e); - this.run = false; - } - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped"); - } - - public void abort() { - synchronized(this) { - this.run = false; - notifyAll(); - } - } - - public void pause() { - synchronized(this) { - if (!this.pause) { - this.pause = true; - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused"); - } - } - } - - public void endPause() { - synchronized(this) { - if (this.pause) { - this.pause = false; - notifyAll(); - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed"); - } - } - } - } - // export methods public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index f3c16058d..51b7a3ee4 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -33,7 +33,6 @@ import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Set; -import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -55,7 +54,6 @@ import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -66,7 +64,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.ISO639; import net.yacy.kelondro.util.LookAheadIterator; -import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; import net.yacy.search.query.RWIProcess; @@ -252,7 +249,7 @@ public class Segment { * @param host * @return an iterator for all url hashes that belong to a specific host */ - public Iterator hostSelector(String host) { + private Iterator hostSelector(String host) { String hh = DigestURI.hosthash(host); final HandleSet ref = new HandleSet(12, Base64Order.enhancedCoder, 100); for (byte[] b: this.urlMetadata) { @@ -551,12 +548,6 @@ public class Segment { return newEntry; } - - // method for index deletion - public int removeAllUrlReferences(final DigestURI url, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { - return removeAllUrlReferences(url.hash(), loader, cacheStrategy); - } - public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { for (final byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy); } @@ -604,129 +595,4 @@ public class Segment { } } - - // The Cleaner class was provided as 
"UrldbCleaner" by Hydrox - public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) { - return new ReferenceCleaner(startHash); - } - - public class ReferenceCleaner extends Thread { - - private final byte[] startHash; - private boolean run = true; - private boolean pause = false; - public int rwiCountAtStart = 0; - public byte[] wordHashNow = null; - public byte[] lastWordHash = null; - public int lastDeletionCounter = 0; - - public ReferenceCleaner(final byte[] startHash) { - this.startHash = startHash; - this.rwiCountAtStart = termIndex().sizesMax(); - } - - @Override - public void run() { - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); - ReferenceContainer container = null; - WordReferenceVars entry = null; - DigestURI url = null; - final HandleSet urlHashs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - try { - Iterator> indexContainerIterator = Segment.this.termIndex.referenceContainer(this.startHash, false, false, 100, false).iterator(); - while (indexContainerIterator.hasNext() && this.run) { - waiter(); - container = indexContainerIterator.next(); - final Iterator containerIterator = container.entries(); - this.wordHashNow = container.getTermHash(); - while (containerIterator.hasNext() && this.run) { - waiter(); - entry = new WordReferenceVars(containerIterator.next()); - // System.out.println("Wordhash: "+wordHash+" UrlHash: - // "+entry.getUrlHash()); - final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash()); - if (ue == null) { - urlHashs.put(entry.urlhash()); - } else { - url = ue.url(); - if (url == null || Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) { - urlHashs.put(entry.urlhash()); - } - } - } - if (!urlHashs.isEmpty()) try { - final int removed = Segment.this.termIndex.remove(container.getTermHash(), urlHashs); - Log.logFine("INDEXCLEANER", ASCII.String(container.getTermHash()) + ": " + removed + " of " + container.size() + " URL-entries deleted"); - this.lastWordHash = container.getTermHash(); - this.lastDeletionCounter = urlHashs.size(); - urlHashs.clear(); - } catch (final IOException e) { - Log.logException(e); - } - - if (!containerIterator.hasNext()) { - // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet> containers = Segment.this.termIndex.referenceContainer(container.getTermHash(), false, false, 100, false); - indexContainerIterator = containers.iterator(); - // Make sure we don't get the same wordhash twice, but don't skip a word - if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) { - indexContainerIterator = containers.iterator(); - } - } - } - } catch (final IOException e) { - Log.logException(e); - } catch (final Exception e) { - Log.logException(e); - } - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped"); - } - - public void abort() { - synchronized(this) { - this.run = false; - notifyAll(); - } - } - - public void pause() { - synchronized (this) { - if (!this.pause) { - this.pause = true; - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused"); - } - } - } - - public void endPause() { - synchronized (this) { - if (this.pause) { - this.pause = false; - notifyAll(); - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed"); - } - } - } - - public void waiter() { - synchronized (this) { - if (this.pause) { - try { - this.wait(); - } catch (final InterruptedException e) { - this.run = false; - return; - } - } - } - } - - 
public int rwisize() { - return termIndex().sizesMax(); - } - - public int urlsize() { - return urlMetadata().size(); - } - } } diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index 3248598d8..93e975e9e 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -628,7 +628,7 @@ public final class RWIProcess extends Thread if ( obrwi == null ) { return null; // all time was already wasted in takeRWI to get another element } - final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi); + final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi.getElement(), obrwi.getWeight()); if ( page == null ) { try { this.misses.putUnique(obrwi.getElement().urlhash()); diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index d10281b63..b9e44cd7e 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -24,7 +24,6 @@ package net.yacy; -import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; @@ -38,54 +37,33 @@ import java.io.PrintWriter; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import java.nio.channels.FileLock; -import java.util.Iterator; -import java.util.Map; import java.util.Properties; -import java.util.TreeMap; -import java.util.TreeSet; import java.util.concurrent.Semaphore; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.sorting.Array; -import net.yacy.cora.sorting.OrderedScoreMap; -import net.yacy.cora.sorting.ScoreMap; import net.yacy.gui.YaCyApp; import net.yacy.gui.framework.Browser; -import net.yacy.kelondro.blob.MapDataMining; -import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.order.Base64Order; -import net.yacy.kelondro.rwi.Reference; -import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.Formatter; import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.OS; -import net.yacy.peers.SeedDB; import net.yacy.peers.operation.yacyBuildProperties; import net.yacy.peers.operation.yacyRelease; import net.yacy.peers.operation.yacyVersion; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; -import net.yacy.search.index.MetadataRepository; -import net.yacy.search.index.Segment; import com.google.common.io.Files; import de.anomic.data.Translator; import de.anomic.http.server.HTTPDemon; import de.anomic.server.serverCore; -import de.anomic.tools.enumerateFiles; /** * This is the main class of YaCy. Several threads are started from here: @@ -595,346 +573,6 @@ public final class yacy { Log.logConfig("COMMAND-STEERING", "SUCCESSFULLY FINISHED COMMAND: " + processdescription); } - /** - * This method gets all found words and outputs a statistic about the score - * of the words. The output of this method can be used to create stop-word - * lists. This method will be called if you start yacy with the argument - * -genwordstat. 
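// Caller-visible API changes collected from the hunks above, shown as a condensed
// before/after sketch (every call below is taken from this diff; surrounding code elided):
//
//   // RWIProcess: MetadataRepository.load() no longer takes the queue-element wrapper
//   before: page = this.query.getSegment().urlMetadata().load(obrwi);
//   after:  page = this.query.getSegment().urlMetadata().load(obrwi.getElement(), obrwi.getWeight());
//
//   // Segment: the DigestURI convenience overload was removed; callers hash the URL themselves
//   before: segment.removeAllUrlReferences(url, loader, cacheStrategy);
//   after:  segment.removeAllUrlReferences(url.hash(), loader, cacheStrategy);
//
//   // MetadataRepository.exists() now checks the local urlIndexFile before asking Solr,
//   // so a hit in the local table short-circuits the remote lookup.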
- * FIXME: How can stop-word list be created from this output? What type of - * score is output? - * - * @param homePath Root-Path where all the information is to be found. - */ - private static void genWordstat(final File homePath) { - // start up - System.out.println(copyright); - System.out.println(hline); - - // load words - Log.logInfo("GEN-WORDSTAT", "loading words..."); - final TreeMap words = loadWordMap(new File(homePath, "yacy.words")); - - // find all hashes - Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases..."); - final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/"); - final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true); - File f; - byte[] h; - final ScoreMap hs = new OrderedScoreMap(Base64Order.standardCoder); - while (ef.hasMoreElements()) { - f = ef.nextElement(); - h = f.getName().substring(0, Word.commonHashLength).getBytes(); - hs.inc(h, (int) f.length()); - } - - // list the hashes in reverse order - Log.logInfo("GEN-WORDSTAT", "listing words in reverse size order..."); - String w; - final Iterator i = hs.keys(false); - while (i.hasNext()) { - h = i.next(); - w = words.get(h); - if (w == null) System.out.print("# " + h); else System.out.print(w); - System.out.println(" - " + hs.get(h)); - } - - // finished - Log.logConfig("GEN-WORDSTAT", "FINISHED"); - } - - /** - * @param homePath path to the YaCy directory - * @param networkName - */ - public static void minimizeUrlDB(final File dataHome, final File appHome, final String networkName) { - // run with "java -classpath classes yacy -minimizeUrlDB" - try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {} - final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX"); - final File indexRoot2 = new File(dataHome, "DATA/INDEX2"); - final Log log = new Log("URL-CLEANUP"); - try { - log.logInfo("STARTING URL CLEANUP"); - - // db containing all currently loades urls - final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT")); - currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false); - - // db used to hold all neede urls - final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT")); - minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false); - - final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total()); - if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - - final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null); - wordIndex.connectRWI(10000, Integer.MAX_VALUE); - wordIndex.connectUrlDb(false, false); - final Iterator> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false); - - long urlCounter = 0, wordCounter = 0; - long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0; - String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash; - - while (indexContainerIterator.hasNext()) { - ReferenceContainer wordIdxContainer = null; - try { - wordCounter++; - wordIdxContainer = indexContainerIterator.next(); - - // the combined container will fit, read the container - final Iterator wordIdxEntries = wordIdxContainer.entries(); - Reference iEntry; - while (wordIdxEntries.hasNext()) { - iEntry = wordIdxEntries.next(); - final byte[] urlHash = iEntry.urlhash(); - if 
((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - final URIMetadata urlEntry = currentUrlDB.load(urlHash); - urlCounter++; - minimizedUrlDB.store(urlEntry); - if (urlCounter % 500 == 0) { - log.logInfo(urlCounter + " URLs found so far."); - } - } catch (final IOException e) {} - } - - if (wordCounter%500 == 0) { - wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash()); - wordChunkEnd = System.currentTimeMillis(); - final long duration = wordChunkEnd - wordChunkStart; - log.logInfo(wordCounter + " words scanned " + - "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" + - "Duration: "+ 500*1000/duration + " words/s" + - " | Free memory: " + MemoryControl.free() + - " | Total memory: " + MemoryControl.total()); - wordChunkStart = wordChunkEnd; - wordChunkStartHash = wordChunkEndHash; - } - - // we have read all elements, now we can close it - wordIdxContainer = null; - - } catch (final Exception e) { - log.logSevere("Exception", e); - } finally { - if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (final Exception e) {} - } - } - log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries."); - log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries."); - - currentUrlDB.close(); - minimizedUrlDB.close(); - wordIndex.close(); - - // TODO: rename the mimimized UrlDB to the name of the previous UrlDB - - log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP"); - log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db"); - - log.logInfo("TERMINATED URL CLEANUP"); - } catch (final Exception e) { - log.logSevere("Exception: " + e.getMessage(), e); - } catch (final Error e) { - log.logSevere("Error: " + e.getMessage(), e); - } - } - - /** - * Reads all words from the given file and creates a treemap, where key is - * the plasma word hash and value is the word itself. - * - * @param wordlist File where the words are stored. - * @return HashMap with the hash-word - relation. - */ - private static TreeMap loadWordMap(final File wordlist) { - // returns a hash-word - Relation - final TreeMap wordmap = new TreeMap(Base64Order.enhancedCoder); - try { - String word; - final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist))); - while ((word = br.readLine()) != null) wordmap.put(Word.word2hash(word), word); - br.close(); - } catch (final IOException e) {} - return wordmap; - } - - /** - * Cleans a wordlist in a file according to the length of the words. The - * file with the given filename is read and then only the words in the given - * length-range are written back to the file. - * - * @param wordlist Name of the file the words are stored in. - * @param minlength Minimal needed length for each word to be stored. - * @param maxlength Maximal allowed length for each word to be stored. 
- */ - private static void cleanwordlist(final String wordlist, final int minlength, final int maxlength) { - // start up - System.out.println(copyright); - System.out.println(hline); - Log.logConfig("CLEAN-WORDLIST", "START"); - - String word; - final TreeSet wordset = new TreeSet(); - int count = 0; - try { - final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist))); - final String seps = "' .,:/-&"; - while ((word = br.readLine()) != null) { - word = word.toLowerCase().trim(); - for (int i = 0; i < seps.length(); i++) { - if (word.indexOf(seps.charAt(i)) >= 0) word = word.substring(0, word.indexOf(seps.charAt(i))); - } - if ((word.length() >= minlength) && (word.length() <= maxlength)) wordset.add(word); - count++; - } - br.close(); - - if (wordset.size() != count) { - count = count - wordset.size(); - final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(wordlist))); - while (!wordset.isEmpty()) { - word = wordset.first(); - bw.write(word + "\n"); - wordset.remove(word); - } - bw.close(); - Log.logInfo("CLEAN-WORDLIST", "shrinked wordlist by " + count + " words."); - } else { - Log.logInfo("CLEAN-WORDLIST", "not necessary to change wordlist"); - } - } catch (final IOException e) { - Log.logSevere("CLEAN-WORDLIST", "ERROR: " + e.getMessage()); - System.exit(-1); - } - - // finished - Log.logConfig("CLEAN-WORDLIST", "FINISHED"); - } - - private static String[] shift(final String[] args, final int pos, final int count) { - final String[] newargs = new String[args.length - count]; - System.arraycopy(args, 0, newargs, 0, pos); - System.arraycopy(args, pos + count, newargs, pos, args.length - pos - count); - return newargs; - } - - /** - * Uses an Iteration over urlHash.db to detect malformed URL-Entries. - * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function. - * - * @param homePath Root-Path where all information is to be found. 
-    /**
-     * Uses an iteration over urlHash.db to detect malformed URL entries.
-     * Damaged entries are collected in a set and removed at the end of the run.
-     *
-     * @param dataHome root path of the DATA directory.
-     * @param appHome application home path, used to configure logging.
-     * @param networkName name of the network whose index is cleaned.
-     */
-    private static void urldbcleanup(final File dataHome, final File appHome, final String networkName) {
-        final File root = dataHome;
-        final File indexroot = new File(root, "DATA/INDEX");
-        try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
-        final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
-        currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
-        currentUrlDB.deadlinkCleaner();
-        currentUrlDB.close();
-    }
-
-    private static void RWIHashList(final File dataHome, final File appHome, final String targetName, final String resource, final String format) {
-        Segment WordIndex = null;
-        final Log log = new Log("HASHLIST");
-        final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
-        final String wordChunkStartHash = "AAAAAAAAAAAA";
-        try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
-        log.logInfo("STARTING CREATION OF RWI-HASHLIST");
-        final File root = dataHome;
-        try {
-            Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
-            if (resource.equals("all")) {
-                WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
-                WordIndex.connectRWI(10000, Integer.MAX_VALUE);
-                WordIndex.connectUrlDb(false, false);
-                indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
-            }
-            int counter = 0;
-            ReferenceContainer<WordReference> container = null;
-            if (format.equals("zip")) {
-                log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
-                final ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
-                final File file = new File(root, targetName + ".zip");
-                final ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
-                bos.putNextEntry(zipEntry);
-                if (indexContainerIterator != null) {
-                    while (indexContainerIterator.hasNext()) {
-                        counter++;
-                        container = indexContainerIterator.next();
-                        bos.write(container.getTermHash());
-                        bos.write(serverCore.CRLF);
-                        if (counter % 500 == 0) {
-                            log.logInfo("Found " + counter + " Hashes until now. Last found Hash: " + ASCII.String(container.getTermHash()));
-                        }
-                    }
-                }
-                bos.flush();
-                bos.close();
-            } else {
-                log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
-                final File file = new File(root, targetName + ".txt");
-                final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
-                if (indexContainerIterator != null) {
-                    while (indexContainerIterator.hasNext()) {
-                        counter++;
-                        container = indexContainerIterator.next();
-                        bos.write(container.getTermHash());
-                        bos.write(serverCore.CRLF);
-                        if (counter % 500 == 0) {
-                            log.logInfo("Found " + counter + " Hashes until now. Last found Hash: " + ASCII.String(container.getTermHash()));
-                        }
-                    }
-                }
-                bos.flush();
-                bos.close();
-            }
-            log.logInfo("Total number of Hashes: " + counter + ". Last found Hash: " + (container == null ? "null" : ASCII.String(container.getTermHash())));
-        } catch (final IOException e) {
-            log.logSevere("IOException", e);
-        }
-        if (WordIndex != null) {
-            WordIndex.close();
-            WordIndex = null;
-        }
-    }
-
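
RWIHashList writes one term hash per line, either into a single-entry ZIP archive or a plain text file. A condensed standalone sketch of the same two output paths using only java.util.zip; CRLF stands in for serverCore.CRLF, and HashListWriterSketch is a hypothetical name:

    import java.io.BufferedOutputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.List;
    import java.util.zip.ZipEntry;
    import java.util.zip.ZipOutputStream;

    public class HashListWriterSketch {
        static final byte[] CRLF = {'\r', '\n'}; // stands in for serverCore.CRLF

        // writes one hash per line, either into a .zip (single entry) or a plain .txt
        static void write(final List<byte[]> hashes, final String targetName, final boolean zip) throws IOException {
            if (zip) {
                try (final ZipOutputStream out = new ZipOutputStream(new FileOutputStream(targetName + ".zip"))) {
                    out.putNextEntry(new ZipEntry(targetName + ".txt"));
                    for (final byte[] h : hashes) { out.write(h); out.write(CRLF); }
                    out.closeEntry();
                }
            } else {
                try (final BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(targetName + ".txt"))) {
                    for (final byte[] h : hashes) { out.write(h); out.write(CRLF); }
                }
            }
        }
    }
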
"null" : ASCII.String(container.getTermHash()))); - } catch (final IOException e) { - log.logSevere("IOException", e); - } - if (WordIndex != null) { - WordIndex.close(); - WordIndex = null; - } - } - - /** - * Searching for peers affected by Bug - * @param homePath - */ - public static void testPeerDB(final File homePath) { - - try { - final File yacyDBPath = new File(homePath, "DATA/INDEX/freeworld/NETWORK"); - - final String[] dbFileNames = {"seed.new.db","seed.old.db","seed.pot.db"}; - for (final String dbFileName : dbFileNames) { - final File dbFile = new File(yacyDBPath,dbFileName); - final MapDataMining db = new MapDataMining(dbFile, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, SeedDB.sortFields, SeedDB.longaccFields, SeedDB.doubleaccFields); - - Iterator>> it; - it = db.entries(true, false); - while (it.hasNext()) { - final Map.Entry> dna = it.next(); - String peerHash = UTF8.String(dna.getKey()); - if (peerHash.length() < Word.commonHashLength) { - final String peerName = dna.getValue().get("Name"); - final String peerIP = dna.getValue().get("IP"); - final String peerPort = dna.getValue().get("Port"); - - while (peerHash.length() < Word.commonHashLength) { peerHash = peerHash + "_"; } - System.err.println("Invalid Peer-Hash found in '" + dbFileName + "': " + peerName + ":" + peerHash + ", http://" + peerIP + ":" + peerPort); - } - } - db.close(); - } - } catch (final Exception e) { - Log.logException(e); - } - } - - /** * Main-method which is started by java. Checks for special arguments or * starts up the application. @@ -993,46 +631,6 @@ public final class yacy { } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) { // show yacy version System.out.println(copyright); - } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) { - // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible - // attention: this may run long and should not be interrupted! 
-
     /**
      * Main-method which is started by java. Checks for special arguments or
      * starts up the application.
@@ -993,46 +631,6 @@ public final class yacy {
         } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) {
             // show yacy version
             System.out.println(copyright);
-        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
-            // minimize the url database: re-write only those URL entries that are still referenced by the word index
-            // attention: this may run long and should not be interrupted!
-            if (args.length >= 3 && args[1].toLowerCase().equals("-cache")) {
-                args = shift(args, 1, 2);
-            }
-            if (args.length == 2) applicationRoot = new File(args[1]);
-            minimizeUrlDB(dataRoot, applicationRoot, "freeworld");
-        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-testpeerdb"))) {
-            if (args.length == 2) {
-                applicationRoot = new File(args[1]);
-            } else if (args.length > 2) {
-                System.err.println("Usage: -testPeerDB [homeDbRoot]");
-            }
-            testPeerDB(applicationRoot);
-        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-genwordstat"))) {
-            // this can help to create a stop-word list
-            // to use this, you need a 'yacy.words' file in the root path
-            // start this with "java -classpath classes yacy -genwordstat [<rootPath>]"
-            if (args.length == 2) applicationRoot = new File(args[1]);
-            genWordstat(applicationRoot);
-        } else if ((args.length == 4) && (args[0].toLowerCase().equals("-cleanwordlist"))) {
-            // this can be used to organize and clean a word-list
-            // start this with "java -classpath classes yacy -cleanwordlist <wordFile> <minLength> <maxLength>"
-            final int minlength = Integer.parseInt(args[2]);
-            final int maxlength = Integer.parseInt(args[3]);
-            cleanwordlist(args[1], minlength, maxlength);
-        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
-            // clean up the url database: remove entries with dead links
-            if (args.length == 2) applicationRoot = new File(args[1]);
-            urldbcleanup(dataRoot, applicationRoot, "freeworld");
-        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-rwihashlist"))) {
-            // generate a list of all RWI hashes and save it in a file
-            String domain = "all";
-            String format = "txt";
-            if (args.length >= 2) domain = args[1];
-            if (args.length >= 3) format = args[2];
-            if (args.length == 4) applicationRoot = new File(args[3]);
-            final String outfile = "rwihashlist_" + System.currentTimeMillis();
-            RWIHashList(dataRoot, applicationRoot, outfile, domain, format);
         } else {
             if (args.length == 1) applicationRoot = new File(args[0]);
             startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false);