From 3bcd9d622b1988e221d271bdb088fd1fbcd5e018 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 25 Jul 2012 14:31:54 +0200 Subject: [PATCH] cleaned up classes and methods which are either superfluous at this time or will be superfluous or subject of complete redesign after the migration to solr. Removing these things now will make the transition to solr more simple. --- htroot/IndexCleaner_p.java | 108 ---- htroot/IndexControlCleaner_p.html | 108 ---- htroot/PerformanceQueues_p.html | 8 - htroot/PerformanceQueues_p.java | 1 - htroot/PerformanceQueues_p.xml | 1 - .../env/templates/submenuBlacklist.template | 1 - source/de/anomic/data/URLAnalysis.java | 564 ------------------ .../yacy/search/index/MetadataRepository.java | 216 +------ source/net/yacy/search/index/Segment.java | 136 +---- source/net/yacy/search/query/RWIProcess.java | 2 +- source/net/yacy/yacy.java | 402 ------------- 11 files changed, 11 insertions(+), 1536 deletions(-) delete mode 100644 htroot/IndexCleaner_p.java delete mode 100644 htroot/IndexControlCleaner_p.html delete mode 100644 source/de/anomic/data/URLAnalysis.java diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java deleted file mode 100644 index c30ea0e7c..000000000 --- a/htroot/IndexCleaner_p.java +++ /dev/null @@ -1,108 +0,0 @@ -//----------------------- -//part of the AnomicHTTPD caching proxy -//(C) by Michael Peter Christen; mc@yacy.net -//first published on http://www.anomic.de -//Frankfurt, Germany, 2005 -// -//This file is contributed by Matthias Soehnholz -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. -// -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. 
-// -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.search.Switchboard; -import net.yacy.search.index.MetadataRepository; -import net.yacy.search.index.Segment; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class IndexCleaner_p { - private static MetadataRepository.BlacklistCleaner urldbCleanerThread = null; - private static Segment.ReferenceCleaner indexCleanerThread = null; - - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { - final serverObjects prop = new serverObjects(); - final Switchboard sb = (Switchboard) env; - prop.put("title", "DbCleanup_p"); - - // get segment - Segment indexSegment = sb.index; - - if (post!=null) { - if (post.get("action").equals("ustart")) { - if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) { - urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker); - urldbCleanerThread.start(); - } - else { - urldbCleanerThread.endPause(); - } - } - else if (post.get("action").equals("ustop") && (urldbCleanerThread!=null)) { - urldbCleanerThread.abort(); - } - else if (post.get("action").equals("upause") && (urldbCleanerThread!=null)) { - urldbCleanerThread.pause(); - } - else if (post.get("action").equals("rstart")) { - if (indexCleanerThread==null || !indexCleanerThread.isAlive()) { - indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes()); - indexCleanerThread.start(); - } - else { - indexCleanerThread.endPause(); - } - } - else if (post.get("action").equals("rstop") && (indexCleanerThread!=null)) { - indexCleanerThread.abort(); - } - else if (post.get("action").equals("rpause") && (indexCleanerThread!=null)) { - indexCleanerThread.pause(); - } - prop.put("LOCATION",""); - return prop; - } - if (urldbCleanerThread!=null) { - prop.put("urldb", "1"); - prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100); - prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls); - prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls); - prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl); - prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash); - prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl); - prop.put("urldb_lastHash", urldbCleanerThread.lastHash); - prop.put("urldb_threadAlive", Boolean.toString(urldbCleanerThread.isAlive())); - prop.put("urldb_threadToString", urldbCleanerThread.toString()); - final double percent = ((double)urldbCleanerThread.blacklistedUrls/urldbCleanerThread.totalSearchedUrls)*100; - prop.putNum("urldb_percent", percent); - } - if (indexCleanerThread!=null) { - prop.put("rwidb", "1"); - prop.put("rwidb_threadAlive", Boolean.toString(indexCleanerThread.isAlive())); - prop.put("rwidb_threadToString", indexCleanerThread.toString()); - prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart); - prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize()); - prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? 
"NULL" : ASCII.String(indexCleanerThread.wordHashNow)); - prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : ASCII.String(indexCleanerThread.lastWordHash)); - prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter); - - } - return prop; - } -} diff --git a/htroot/IndexControlCleaner_p.html b/htroot/IndexControlCleaner_p.html deleted file mode 100644 index 7ef61758d..000000000 --- a/htroot/IndexControlCleaner_p.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - YaCy '#[clientname]#': Index Cleaner - #%env/templates/metas.template%# - - -
- #(inline)##%env/templates/header.template%#
- Steering of API Actions
- This table shows search results that had been sorted out from the search result display because their content had not been verified.
- This means that the searched word does not appear on the search page.
- ::#(/inline)#
- #(showtable)#::
- #(navigation)#
- ::
- #(left)#no previous page::previous page#(/left)# #[startRecord]#-#[to]# of #[of]# #(right)#no next page::next page#(/right)#
- #(/navigation)#
- Type | Comment | Call Count | Recording Date | Last Exec Date | Next Exec Date | Scheduler | URL
- #(inline)#::#(/inline)#
- #{list}# #[type]# | #[comment]# | #[callcount]# | #[dateRecording]# | #[dateLastExec]# | #[dateNextExec]# | #(scheduler)#::#(/scheduler)# | #[url]# #{/list}#
- #(/showtable)#
- #%env/templates/footer.template%# - - - \ No newline at end of file diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html index 2fc254a01..9793e0d4d 100644 --- a/htroot/PerformanceQueues_p.html +++ b/htroot/PerformanceQueues_p.html @@ -72,14 +72,6 @@ RAM Cache Description - - URLs in RAM buffer: - #[urlCacheSize]# - - This is the size of the URL write buffer. Its purpose is to buffer incoming URLs - in case of search result transmission and during DHT transfer. - - Words in RAM cache:
(Size in KBytes) #[wordCacheSize]#
(#[wordCacheSizeKBytes]# KB) diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index de414a3a4..d9f50d0c3 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -299,7 +299,6 @@ public class PerformanceQueues_p { prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); // table cache settings - prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize()); prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize()); prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024); prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences()); diff --git a/htroot/PerformanceQueues_p.xml b/htroot/PerformanceQueues_p.xml index 8535e76a9..e7fc14d95 100644 --- a/htroot/PerformanceQueues_p.xml +++ b/htroot/PerformanceQueues_p.xml @@ -24,7 +24,6 @@ #{/table}# - #[urlCacheSize]# #[wordCacheSize]# #[maxURLinCache]# #[maxAgeOfCache]# diff --git a/htroot/env/templates/submenuBlacklist.template b/htroot/env/templates/submenuBlacklist.template index 6c157a9eb..a02bb6c1f 100644 --- a/htroot/env/templates/submenuBlacklist.template +++ b/htroot/env/templates/submenuBlacklist.template @@ -5,6 +5,5 @@
  • Blacklist Cleaner
  • Blacklist Test
  • Import/Export
  •
-   • Index Cleaner
  • \ No newline at end of file diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java deleted file mode 100644 index c2693ca52..000000000 --- a/source/de/anomic/data/URLAnalysis.java +++ /dev/null @@ -1,564 +0,0 @@ -// URLAnalysis.java -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 24.02.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -package de.anomic.data; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.net.MalformedURLException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import net.yacy.cora.document.UTF8; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.index.HandleMap; -import net.yacy.kelondro.index.HandleSet; -import net.yacy.kelondro.index.RowSpaceExceededException; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.order.Base64Order; -import net.yacy.kelondro.rwi.ReferenceContainerArray; -import net.yacy.kelondro.util.MemoryControl; -import net.yacy.search.index.MetadataRepository; -import net.yacy.search.index.Segment; -import net.yacy.search.index.MetadataRepository.Export; - -public class URLAnalysis { - - private static final Pattern patternMinus = Pattern.compile("-"); - - /** - * processes to analyse URL lists - */ - - private static DigestURI poison = null; - static { - try { - poison = new DigestURI("http://poison.org/poison"); - } catch (final MalformedURLException e) { - poison = null; - } - } - - public static class splitter extends Thread { - - private final ArrayBlockingQueue in; - private final ConcurrentHashMap out; - - public splitter(final ArrayBlockingQueue in, final ConcurrentHashMap out) { - this.in = in; - this.out = out; - } - - @Override - public void run() { - try { - DigestURI url; - final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_"); - while (true) { - try { - url = this.in.take(); - if (url == poison) break; - 
update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\.")); - update(p.matcher(url.getPath()).replaceAll("/").split("/")); - } catch (final InterruptedException e) { - Log.logException(e); - } - } - } catch (final Exception e) { - Log.logException(e); - } - } - - private void update(final String[] s) { - Integer c; - for (final String t: s) { - if (t.isEmpty()) continue; - c = this.out.get(t); - this.out.put(t, (c == null) ? 1 : c.intValue() + 1); - } - } - } - - public static void cleanup(final ConcurrentHashMap stat) { - Map.Entry entry; - int c, low = Integer.MAX_VALUE; - Iterator> i = stat.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - c = entry.getValue().intValue(); - if (c == 1) { - i.remove(); - } else { - if (c < low) low = c; - } - } - i = stat.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - c = entry.getValue().intValue(); - if (c == low) { - i.remove(); - } - } - Runtime.getRuntime().gc(); - } - - public static void genstat(final String urlfile) { - - final boolean gz = urlfile.endsWith(".gz"); - final String analysis = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".stats.gz" : urlfile + ".stats"; - final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8); - - // start threads - final ArrayBlockingQueue in = new ArrayBlockingQueue(1000); - final ConcurrentHashMap out = new ConcurrentHashMap(); - for (int i = 0, available = Runtime.getRuntime().availableProcessors(); i < available; i++) new splitter(in, out).start(); - final splitter spl = new splitter(in, out); - spl.start(); - - // put urls in queue - final File infile = new File(urlfile); - final File outfile = new File(analysis); - BufferedReader reader = null; - long time = System.currentTimeMillis(); - final long start = time; - int count = 0; - - System.out.println("start processing"); - try { - InputStream is = new BufferedInputStream(new FileInputStream(infile)); - if (gz) is = new GZIPInputStream(is); - reader = new BufferedReader(new InputStreamReader(is)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.length() > 0) { - try { - final DigestURI url = new DigestURI(line); - in.put(url); - } catch (final InterruptedException e) { - Log.logException(e); - } catch (final MalformedURLException e) { - continue; - } - } - count++; - if (System.currentTimeMillis() - time > 1000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); - if (MemoryControl.available() < cleanuplimit) { - System.out.println("starting cleanup, " + out.size() + " entries in statistic"); - cleanup(out); - System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); - } - } - } - reader.close(); - } catch (final IOException e) { - Log.logException(e); - } finally { - if (reader != null) try { reader.close(); } catch (final Exception e) {} - } - - // stop threads - System.out.println("stopping threads"); - for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try { - in.put(poison); - } catch (final InterruptedException e) { - Log.logException(e); - } - try { - spl.join(); - } catch (final InterruptedException e1) { - Log.logException(e1); - } - - // generate statistics - System.out.println("start processing results"); - final TreeMap results = 
new TreeMap(); - count = 0; - Map.Entry entry; - final Iterator> i = out.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue()); - count++; - i.remove(); // free memory - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); - } - } - - // write statistics - System.out.println("start writing results"); - try { - OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile)); - if (gz) os = new GZIPOutputStream(os); - count = 0; - for (final Map.Entry e: results.entrySet()) { - os.write(UTF8.getBytes(e.getKey())); - os.write(new byte[]{'\t'}); - os.write(UTF8.getBytes(Integer.toString(e.getValue()))); - os.write(new byte[]{'\n'}); - count++; - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("wrote " + count + " lines."); - } - } - os.close(); - } catch (final IOException e) { - Log.logException(e); - } - - System.out.println("finished"); - } - - public static void genhost(final String urlfile) { - - final boolean gz = urlfile.endsWith(".gz"); - final String trunk = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".host" : urlfile + ".host"; - final HashSet hosts = new HashSet(); - final File infile = new File(urlfile); - BufferedReader reader = null; - long time = System.currentTimeMillis(); - final long start = time; - int count = 0; - - System.out.println("start processing"); - try { - InputStream is = new BufferedInputStream(new FileInputStream(infile)); - if (gz) is = new GZIPInputStream(is); - reader = new BufferedReader(new InputStreamReader(is)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.length() > 0) { - try { - final DigestURI url = new DigestURI(line); - hosts.add(url.getHost()); - } catch (final MalformedURLException e) { - continue; - } - } - count++; - if (System.currentTimeMillis() - time > 1000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); - } - } - reader.close(); - } catch (final IOException e) { - Log.logException(e); - } finally { - if (reader != null) try { reader.close(); } catch (final Exception e) {} - } - - // copy everything into a TreeSet to order it - System.out.println("start processing results"); - final TreeSet results = new TreeSet(); - count = 0; - final Iterator i = hosts.iterator(); - while (i.hasNext()) { - results.add(i.next()); - count++; - i.remove(); // free memory - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); - } - } - - // write hosts - writeSet(trunk, gz, results); - - System.out.println("finished"); - } - - private static void writeSet(final String trunk, final boolean gz, final Set set) { - - // write hosts - System.out.println("start writing results"); - final File outfile = new File(trunk + ((gz) ? 
".gz" : "")); - long time = System.currentTimeMillis(); - try { - OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile)); - if (gz) os = new GZIPOutputStream(os); - int count = 0; - for (final String h: set) { - os.write(UTF8.getBytes(h)); - os.write(new byte[]{'\n'}); - count++; - if (System.currentTimeMillis() - time > 10000) { - time = System.currentTimeMillis(); - System.out.println("wrote " + count + " lines."); - } - } - os.close(); - } catch (final IOException e) { - Log.logException(e); - } - - System.out.println("finished writing results"); - } - - public static void sortsplit(final String urlfile) { - - final boolean gz = urlfile.endsWith(".gz"); - final String trunk = ((gz) ? urlfile.substring(0, urlfile.length() - 3) : urlfile) + ".sort"; - final File infile = new File(urlfile); - final TreeSet urls = new TreeSet(); - BufferedReader reader = null; - long time = System.currentTimeMillis(); - final long start = time; - int count = 0; - int filecount = 0; - final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8); - - System.out.println("start processing"); - try { - InputStream is = new BufferedInputStream(new FileInputStream(infile)); - if (gz) is = new GZIPInputStream(is); - reader = new BufferedReader(new InputStreamReader(is)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.length() > 0) { - try { - final DigestURI url = new DigestURI(line); - urls.add(url.toNormalform(true, true)); - } catch (final MalformedURLException e) { - continue; - } - } - count++; - if (System.currentTimeMillis() - time > 1000) { - time = System.currentTimeMillis(); - System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); - } - if (MemoryControl.available() < cleanuplimit) { - writeSet(trunk + "." + filecount, gz, urls); - filecount++; - urls.clear(); - Runtime.getRuntime().gc(); - } - } - reader.close(); - } catch (final IOException e) { - Log.logException(e); - } finally { - if (reader != null) try { reader.close(); } catch (final Exception e) {} - } - - // write hosts - writeSet(trunk + "." 
+ filecount, gz, urls); - - System.out.println("finished"); - } - - public static void incell(final File cellPath, final String statisticPath) { - try { - final HandleMap idx = ReferenceContainerArray.referenceHashes( - cellPath, - Segment.wordReferenceFactory, - Base64Order.enhancedCoder, - WordReferenceRow.urlEntryRow); - System.out.println("INDEX REFERENCE COLLECTION starting dump of statistics"); - idx.dump(new File(statisticPath)); - System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath); - idx.close(); - } catch (final Exception e) { - Log.logException(e); - } - } - - public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException { - System.out.println("INDEX DIFF URL-COL startup"); - final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile)); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - mr.connectUrlDb(Segment.UrlDbName, false, false); - final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000); - System.out.println("INDEX DIFF URL-COL loaded dump, starting diff"); - final long start = System.currentTimeMillis(); - long update = start - 7000; - int count = 0; - for (final byte[] refhash: mr) { - if (idx.get(refhash) == -1) { - // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash - hs.put(refhash); - } - count++; - if (System.currentTimeMillis() - update > 10000) { - System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - count) / count) / 60000) + " minutes remaining"); - update = System.currentTimeMillis(); - } - } - idx.close(); - mr.close(); - System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile); - count = hs.dump(new File(diffFile)); - System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump"); - return count; - } - - public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException { - // format: 0=text, 1=html, 2=rss/xml - System.out.println("URL EXPORT startup"); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - mr.connectUrlDb(Segment.UrlDbName, false, false); - final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); - System.out.println("URL EXPORT loaded dump, starting export"); - final Export e = mr.export(new File(export), ".*", hs, format, false); - try { - e.join(); - } catch (final InterruptedException e1) { - Log.logException(e1); - } - System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? 
mr.size() : hs.size()) + " entries"); - } - - public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException { - System.out.println("URL DELETE startup"); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - mr.connectUrlDb(Segment.UrlDbName, false, false); - final int mrSize = mr.size(); - final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); - System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); - for (final byte[] refhash: hs) { - mr.remove(refhash); - } - System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database"); - } - - public static void main(final String[] args) { - if (args[0].equals("-stat") && args.length >= 2) { - // generate a statistics about common words in file, store to .stat - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz - for (int i = 1; i < args.length; i++) genstat(args[i]); - } else if (args[0].equals("-host") && args.length >= 2) { - // generate a file .host containing only the hosts of the urls - for (int i = 1; i < args.length; i++) genhost(args[i]); - } else if (args[0].equals("-sort") && args.length >= 2) { - // generate file .x.sort with sorted lists and split the file in smaller pieces - for (int i = 1; i < args.length; i++) sortsplit(args[i]); - } else if (args[0].equals("-incell") && args.length >= 2) { - // generate a dump of all referenced URL hashes from a given RICELL - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incell DATA/INDEX/freeworld/TEXT/RICELL used.dump - incell(new File(args[1]), args[2]); - } else if (args[0].equals("-diffurlcol") && args.length >= 3) { - // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump - try { - diffurlcol(args[1], args[2], args[3]); - } catch (final Exception e) { - Log.logException(e); - } - } else if (args[0].equals("-export") && args.length >= 4) { - // export a url-list file - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump - // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html' - final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0; - try { - export(args[1], format, args[3], (args.length >= 5) ? 
args[4] : null); - } catch (final Exception e) { - Log.logException(e); - } - } else if (args[0].equals("-delete") && args.length >= 3) { - // delete from URLs as given by urlreference diff dump - // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump - // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html' - try { - delete(args[1], args[2]); - } catch (final Exception e) { - Log.logException(e); - } - } else { - System.out.println("usage:"); - System.out.println(); - System.out.println("-stat "); - System.out.println(" generate a statistics about common words in file, store to .stat"); - System.out.println(); - System.out.println("-host "); - System.out.println(" generate a file .host containing only the hosts of the urls"); - System.out.println(); - System.out.println("-sort "); - System.out.println(" generate file .x.sort with sorted lists and split the file in smaller pieces"); - System.out.println(); - System.out.println("-incollection "); - System.out.println(" generate a dump of all referenced URL hashes"); - System.out.println(); - System.out.println("-diffurlcol "); - System.out.println(" find URLs that occur in url-db but not in collections"); - System.out.println(); - System.out.println("-export "); - System.out.println(" export urls to file. the last argument can be omitted, then all urls are exported"); - System.out.println(); - System.out.println("-delete "); - System.out.println(" delete all urls that are listed in the diff-dump from the url-db"); - System.out.println(); - System.out.println("to do a complete clean-up of the url database, start the following:"); - System.out.println(); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump"); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump"); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump"); - System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump"); - System.out.println(); - } - System.exit(0); // kill remaining threads - } - - private static final String num(final int i) { - final StringBuilder s = new StringBuilder(Integer.toString(i)); - while (s.length() < 9) s.insert(0, "0"); - return s.toString(); - } -} diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index d872885a4..fd458f54c 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -33,21 +33,17 @@ import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.order.CloneableIterator; -import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.solr.DoubleSolrConnector; import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ScoreMap; -import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import 
net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; @@ -61,15 +57,11 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.util.MemoryControl; -import net.yacy.repository.Blacklist; -import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.search.solr.EmbeddedSolrConnector; import org.apache.lucene.util.Version; -import de.anomic.crawler.CrawlStacker; - public final class MetadataRepository implements /*Metadata,*/ Iterable { // class objects @@ -186,26 +178,20 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.solr.close(); } - public int writeCacheSize() { - if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize(); - if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize(); - return 0; - } - /** * generates an plasmaLURLEntry using the url hash * if the url cannot be found, this returns null * @param obrwi * @return */ - public URIMetadata load(final WeakPriorityBlockingQueue.Element obrwi) { - if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element - final byte[] urlHash = obrwi.getElement().urlhash(); + public URIMetadata load(WordReferenceVars wre, long weight) { + if (wre == null) return null; // all time was already wasted in takeRWI to get another element + final byte[] urlHash = wre.urlhash(); if (urlHash == null) return null; if (this.urlIndexFile != null) try { final Row.Entry entry = this.urlIndexFile.get(urlHash, false); if (entry == null) return null; - return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight()); + return new URIMetadataRow(entry, wre, weight); } catch (final IOException e) { Log.logException(e); } @@ -280,29 +266,25 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable public boolean exists(final byte[] urlHash) { if (urlHash == null) return false; + if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true; try { if (this.solr.exists(ASCII.String(urlHash))) return true; } catch (final Throwable e) { Log.logException(e); } - if (this.urlIndexFile == null) return false; // case may happen during shutdown - return this.urlIndexFile.has(urlHash); + return false; } - public CloneableIterator keys(final boolean up, final byte[] firstKey) { + @Override + public Iterator iterator() { try { - return this.urlIndexFile.keys(up, firstKey); + return this.urlIndexFile.keys(true, null); } catch (final IOException e) { Log.logException(e); return null; } } - @Override - public Iterator iterator() { - return keys(true, null); - } - public CloneableIterator entries() throws IOException { // enumerates entry elements return new kiter(); @@ -367,186 +349,6 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable } } - - /** - * Uses an Iteration over urlHash.db to detect malformed URL-Entries. - * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function. 
- * - * @param proxyConfig - */ - public void deadlinkCleaner() { - final Log log = new Log("URLDBCLEANUP"); - final HashSet damagedURLS = new HashSet(); - try { - final Iterator eiter = entries(true, null); - int iteratorCount = 0; - while (eiter.hasNext()) try { - eiter.next(); - iteratorCount++; - } catch (final RuntimeException e) { - if(e.getMessage() != null) { - final String m = e.getMessage(); - damagedURLS.add(m.substring(m.length() - 12)); - } else { - log.logSevere("RuntimeException:", e); - } - } - log.logInfo("URLs vorher: " + this.urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size()); - - final HTTPClient client = new HTTPClient(); - final Iterator eiter2 = damagedURLS.iterator(); - byte[] urlHashBytes; - while (eiter2.hasNext()) { - urlHashBytes = ASCII.getBytes(eiter2.next()); - - // trying to fix the invalid URL - String oldUrlStr = null; - try { - // getting the url data as byte array - final Row.Entry entry = this.urlIndexFile.get(urlHashBytes, true); - - // getting the wrong url string - oldUrlStr = entry.getColUTF8(1).trim(); - - int pos = -1; - if ((pos = oldUrlStr.indexOf("://",0)) != -1) { - // trying to correct the url - final String newUrlStr = "http://" + oldUrlStr.substring(pos + 3); - final DigestURI newUrl = new DigestURI(newUrlStr); - - if (client.HEADResponse(newUrl.toString()) != null - && client.getHttpResponse().getStatusLine().getStatusCode() == 200) { - entry.setCol(1, UTF8.getBytes(newUrl.toString())); - this.urlIndexFile.put(entry); - if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr); - } else { - remove(urlHashBytes); - if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (client.getHttpResponse() == null ? 
"null" : client.getHttpResponse().getStatusLine())); - } - } - } catch (final Exception e) { - remove(urlHashBytes); - if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage()); - } - } - - log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size()); - } catch (final IOException e) { - log.logSevere("IOException", e); - } - } - - public BlacklistCleaner getBlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) { - return new BlacklistCleaner(blacklist, crawlStacker); - } - - public class BlacklistCleaner extends Thread { - - private boolean run = true; - private boolean pause; - public int blacklistedUrls = 0; - public int totalSearchedUrls = 1; - public String lastBlacklistedUrl = ""; - public String lastBlacklistedHash = ""; - public String lastUrl = ""; - public String lastHash = ""; - private final Blacklist blacklist; - private final CrawlStacker crawlStacker; - - public BlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) { - this.blacklist = blacklist; - this.crawlStacker = crawlStacker; - } - - @Override - public void run() { - try { - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); - final Iterator eiter = entries(true, null); - while (eiter.hasNext() && this.run) { - synchronized (this) { - if (this.pause) { - try { - this.wait(); - } catch (final InterruptedException e) { - Log.logWarning("URLDBCLEANER", "InterruptedException", e); - this.run = false; - return; - } - } - } - final URIMetadata entry = eiter.next(); - if (entry == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null"); - } else if (entry.hash() == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + "hash == null"); - } else { - this.totalSearchedUrls++; - if (entry.url() == null) { - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + "URL == null"); - remove(entry.hash()); - continue; - } - if (this.blacklist.isListed(BlacklistType.CRAWLER, entry) || - this.blacklist.isListed(BlacklistType.DHT, entry) || - (this.crawlStacker.urlInAcceptedDomain(entry.url()) != null)) { - this.lastBlacklistedUrl = entry.url().toNormalform(true, true); - this.lastBlacklistedHash = ASCII.String(entry.hash()); - if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + " " + entry.url().toNormalform(false, true)); - remove(entry.hash()); - if (this.blacklistedUrls % 100 == 0) { - Log.logInfo("URLDBCLEANER", "Deleted " + this.blacklistedUrls + " URLs until now. 
Last deleted URL-Hash: " + this.lastBlacklistedUrl); - } - } - this.lastUrl = entry.url().toNormalform(true, true); - this.lastHash = ASCII.String(entry.hash()); - } - } - } catch (final RuntimeException e) { - if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL",0) != -1) { - Log.logWarning("URLDBCLEANER", "urlHash not found in LURL", e); - } - else { - Log.logWarning("URLDBCLEANER", "RuntimeException", e); - this.run = false; - } - } catch (final IOException e) { - Log.logException(e); - this.run = false; - } catch (final Exception e) { - Log.logException(e); - this.run = false; - } - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped"); - } - - public void abort() { - synchronized(this) { - this.run = false; - notifyAll(); - } - } - - public void pause() { - synchronized(this) { - if (!this.pause) { - this.pause = true; - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused"); - } - } - } - - public void endPause() { - synchronized(this) { - if (this.pause) { - this.pause = false; - notifyAll(); - Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed"); - } - } - } - } - // export methods public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index f3c16058d..51b7a3ee4 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -33,7 +33,6 @@ import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Set; -import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -55,7 +54,6 @@ import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -66,7 +64,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.ISO639; import net.yacy.kelondro.util.LookAheadIterator; -import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; import net.yacy.search.query.RWIProcess; @@ -252,7 +249,7 @@ public class Segment { * @param host * @return an iterator for all url hashes that belong to a specific host */ - public Iterator hostSelector(String host) { + private Iterator hostSelector(String host) { String hh = DigestURI.hosthash(host); final HandleSet ref = new HandleSet(12, Base64Order.enhancedCoder, 100); for (byte[] b: this.urlMetadata) { @@ -551,12 +548,6 @@ public class Segment { return newEntry; } - - // method for index deletion - public int removeAllUrlReferences(final DigestURI url, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { - return removeAllUrlReferences(url.hash(), loader, cacheStrategy); - } - public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { for (final byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy); } @@ -604,129 +595,4 @@ public class Segment { } } - - // The Cleaner class was provided as 
"UrldbCleaner" by Hydrox - public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) { - return new ReferenceCleaner(startHash); - } - - public class ReferenceCleaner extends Thread { - - private final byte[] startHash; - private boolean run = true; - private boolean pause = false; - public int rwiCountAtStart = 0; - public byte[] wordHashNow = null; - public byte[] lastWordHash = null; - public int lastDeletionCounter = 0; - - public ReferenceCleaner(final byte[] startHash) { - this.startHash = startHash; - this.rwiCountAtStart = termIndex().sizesMax(); - } - - @Override - public void run() { - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); - ReferenceContainer container = null; - WordReferenceVars entry = null; - DigestURI url = null; - final HandleSet urlHashs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - try { - Iterator> indexContainerIterator = Segment.this.termIndex.referenceContainer(this.startHash, false, false, 100, false).iterator(); - while (indexContainerIterator.hasNext() && this.run) { - waiter(); - container = indexContainerIterator.next(); - final Iterator containerIterator = container.entries(); - this.wordHashNow = container.getTermHash(); - while (containerIterator.hasNext() && this.run) { - waiter(); - entry = new WordReferenceVars(containerIterator.next()); - // System.out.println("Wordhash: "+wordHash+" UrlHash: - // "+entry.getUrlHash()); - final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash()); - if (ue == null) { - urlHashs.put(entry.urlhash()); - } else { - url = ue.url(); - if (url == null || Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) { - urlHashs.put(entry.urlhash()); - } - } - } - if (!urlHashs.isEmpty()) try { - final int removed = Segment.this.termIndex.remove(container.getTermHash(), urlHashs); - Log.logFine("INDEXCLEANER", ASCII.String(container.getTermHash()) + ": " + removed + " of " + container.size() + " URL-entries deleted"); - this.lastWordHash = container.getTermHash(); - this.lastDeletionCounter = urlHashs.size(); - urlHashs.clear(); - } catch (final IOException e) { - Log.logException(e); - } - - if (!containerIterator.hasNext()) { - // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet> containers = Segment.this.termIndex.referenceContainer(container.getTermHash(), false, false, 100, false); - indexContainerIterator = containers.iterator(); - // Make sure we don't get the same wordhash twice, but don't skip a word - if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) { - indexContainerIterator = containers.iterator(); - } - } - } - } catch (final IOException e) { - Log.logException(e); - } catch (final Exception e) { - Log.logException(e); - } - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped"); - } - - public void abort() { - synchronized(this) { - this.run = false; - notifyAll(); - } - } - - public void pause() { - synchronized (this) { - if (!this.pause) { - this.pause = true; - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused"); - } - } - } - - public void endPause() { - synchronized (this) { - if (this.pause) { - this.pause = false; - notifyAll(); - Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed"); - } - } - } - - public void waiter() { - synchronized (this) { - if (this.pause) { - try { - this.wait(); - } catch (final InterruptedException e) { - this.run = false; - return; - } - } - } - } - - 
public int rwisize() { - return termIndex().sizesMax(); - } - - public int urlsize() { - return urlMetadata().size(); - } - } } diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index 3248598d8..93e975e9e 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -628,7 +628,7 @@ public final class RWIProcess extends Thread if ( obrwi == null ) { return null; // all time was already wasted in takeRWI to get another element } - final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi); + final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi.getElement(), obrwi.getWeight()); if ( page == null ) { try { this.misses.putUnique(obrwi.getElement().urlhash()); diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index d10281b63..b9e44cd7e 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -24,7 +24,6 @@ package net.yacy; -import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; @@ -38,54 +37,33 @@ import java.io.PrintWriter; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import java.nio.channels.FileLock; -import java.util.Iterator; -import java.util.Map; import java.util.Properties; -import java.util.TreeMap; -import java.util.TreeSet; import java.util.concurrent.Semaphore; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.sorting.Array; -import net.yacy.cora.sorting.OrderedScoreMap; -import net.yacy.cora.sorting.ScoreMap; import net.yacy.gui.YaCyApp; import net.yacy.gui.framework.Browser; -import net.yacy.kelondro.blob.MapDataMining; -import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.order.Base64Order; -import net.yacy.kelondro.rwi.Reference; -import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.Formatter; import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.OS; -import net.yacy.peers.SeedDB; import net.yacy.peers.operation.yacyBuildProperties; import net.yacy.peers.operation.yacyRelease; import net.yacy.peers.operation.yacyVersion; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; -import net.yacy.search.index.MetadataRepository; -import net.yacy.search.index.Segment; import com.google.common.io.Files; import de.anomic.data.Translator; import de.anomic.http.server.HTTPDemon; import de.anomic.server.serverCore; -import de.anomic.tools.enumerateFiles; /** * This is the main class of YaCy. Several threads are started from here: @@ -595,346 +573,6 @@ public final class yacy { Log.logConfig("COMMAND-STEERING", "SUCCESSFULLY FINISHED COMMAND: " + processdescription); } - /** - * This method gets all found words and outputs a statistic about the score - * of the words. The output of this method can be used to create stop-word - * lists. This method will be called if you start yacy with the argument - * -genwordstat. 
- * FIXME: How can stop-word list be created from this output? What type of - * score is output? - * - * @param homePath Root-Path where all the information is to be found. - */ - private static void genWordstat(final File homePath) { - // start up - System.out.println(copyright); - System.out.println(hline); - - // load words - Log.logInfo("GEN-WORDSTAT", "loading words..."); - final TreeMap words = loadWordMap(new File(homePath, "yacy.words")); - - // find all hashes - Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases..."); - final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/"); - final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true); - File f; - byte[] h; - final ScoreMap hs = new OrderedScoreMap(Base64Order.standardCoder); - while (ef.hasMoreElements()) { - f = ef.nextElement(); - h = f.getName().substring(0, Word.commonHashLength).getBytes(); - hs.inc(h, (int) f.length()); - } - - // list the hashes in reverse order - Log.logInfo("GEN-WORDSTAT", "listing words in reverse size order..."); - String w; - final Iterator i = hs.keys(false); - while (i.hasNext()) { - h = i.next(); - w = words.get(h); - if (w == null) System.out.print("# " + h); else System.out.print(w); - System.out.println(" - " + hs.get(h)); - } - - // finished - Log.logConfig("GEN-WORDSTAT", "FINISHED"); - } - - /** - * @param homePath path to the YaCy directory - * @param networkName - */ - public static void minimizeUrlDB(final File dataHome, final File appHome, final String networkName) { - // run with "java -classpath classes yacy -minimizeUrlDB" - try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {} - final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX"); - final File indexRoot2 = new File(dataHome, "DATA/INDEX2"); - final Log log = new Log("URL-CLEANUP"); - try { - log.logInfo("STARTING URL CLEANUP"); - - // db containing all currently loades urls - final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT")); - currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false); - - // db used to hold all neede urls - final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT")); - minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false); - - final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total()); - if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - - final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null); - wordIndex.connectRWI(10000, Integer.MAX_VALUE); - wordIndex.connectUrlDb(false, false); - final Iterator> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false); - - long urlCounter = 0, wordCounter = 0; - long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0; - String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash; - - while (indexContainerIterator.hasNext()) { - ReferenceContainer wordIdxContainer = null; - try { - wordCounter++; - wordIdxContainer = indexContainerIterator.next(); - - // the combined container will fit, read the container - final Iterator wordIdxEntries = wordIdxContainer.entries(); - Reference iEntry; - while (wordIdxEntries.hasNext()) { - iEntry = wordIdxEntries.next(); - final byte[] urlHash = iEntry.urlhash(); - if 
((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - final URIMetadata urlEntry = currentUrlDB.load(urlHash); - urlCounter++; - minimizedUrlDB.store(urlEntry); - if (urlCounter % 500 == 0) { - log.logInfo(urlCounter + " URLs found so far."); - } - } catch (final IOException e) {} - } - - if (wordCounter%500 == 0) { - wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash()); - wordChunkEnd = System.currentTimeMillis(); - final long duration = wordChunkEnd - wordChunkStart; - log.logInfo(wordCounter + " words scanned " + - "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" + - "Duration: "+ 500*1000/duration + " words/s" + - " | Free memory: " + MemoryControl.free() + - " | Total memory: " + MemoryControl.total()); - wordChunkStart = wordChunkEnd; - wordChunkStartHash = wordChunkEndHash; - } - - // we have read all elements, now we can close it - wordIdxContainer = null; - - } catch (final Exception e) { - log.logSevere("Exception", e); - } finally { - if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (final Exception e) {} - } - } - log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries."); - log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries."); - - currentUrlDB.close(); - minimizedUrlDB.close(); - wordIndex.close(); - - // TODO: rename the mimimized UrlDB to the name of the previous UrlDB - - log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP"); - log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db"); - - log.logInfo("TERMINATED URL CLEANUP"); - } catch (final Exception e) { - log.logSevere("Exception: " + e.getMessage(), e); - } catch (final Error e) { - log.logSevere("Error: " + e.getMessage(), e); - } - } - - /** - * Reads all words from the given file and creates a treemap, where key is - * the plasma word hash and value is the word itself. - * - * @param wordlist File where the words are stored. - * @return HashMap with the hash-word - relation. - */ - private static TreeMap loadWordMap(final File wordlist) { - // returns a hash-word - Relation - final TreeMap wordmap = new TreeMap(Base64Order.enhancedCoder); - try { - String word; - final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist))); - while ((word = br.readLine()) != null) wordmap.put(Word.word2hash(word), word); - br.close(); - } catch (final IOException e) {} - return wordmap; - } - - /** - * Cleans a wordlist in a file according to the length of the words. The - * file with the given filename is read and then only the words in the given - * length-range are written back to the file. - * - * @param wordlist Name of the file the words are stored in. - * @param minlength Minimal needed length for each word to be stored. - * @param maxlength Maximal allowed length for each word to be stored. 
- */ - private static void cleanwordlist(final String wordlist, final int minlength, final int maxlength) { - // start up - System.out.println(copyright); - System.out.println(hline); - Log.logConfig("CLEAN-WORDLIST", "START"); - - String word; - final TreeSet wordset = new TreeSet(); - int count = 0; - try { - final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist))); - final String seps = "' .,:/-&"; - while ((word = br.readLine()) != null) { - word = word.toLowerCase().trim(); - for (int i = 0; i < seps.length(); i++) { - if (word.indexOf(seps.charAt(i)) >= 0) word = word.substring(0, word.indexOf(seps.charAt(i))); - } - if ((word.length() >= minlength) && (word.length() <= maxlength)) wordset.add(word); - count++; - } - br.close(); - - if (wordset.size() != count) { - count = count - wordset.size(); - final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(wordlist))); - while (!wordset.isEmpty()) { - word = wordset.first(); - bw.write(word + "\n"); - wordset.remove(word); - } - bw.close(); - Log.logInfo("CLEAN-WORDLIST", "shrinked wordlist by " + count + " words."); - } else { - Log.logInfo("CLEAN-WORDLIST", "not necessary to change wordlist"); - } - } catch (final IOException e) { - Log.logSevere("CLEAN-WORDLIST", "ERROR: " + e.getMessage()); - System.exit(-1); - } - - // finished - Log.logConfig("CLEAN-WORDLIST", "FINISHED"); - } - - private static String[] shift(final String[] args, final int pos, final int count) { - final String[] newargs = new String[args.length - count]; - System.arraycopy(args, 0, newargs, 0, pos); - System.arraycopy(args, pos + count, newargs, pos, args.length - pos - count); - return newargs; - } - - /** - * Uses an Iteration over urlHash.db to detect malformed URL-Entries. - * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function. - * - * @param homePath Root-Path where all information is to be found. 
- */
- private static void urldbcleanup(final File dataHome, final File appHome, final String networkName) {
- final File root = dataHome;
- final File indexroot = new File(root, "DATA/INDEX");
- try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
- final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
- currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
- currentUrlDB.deadlinkCleaner();
- currentUrlDB.close();
- }
-
- private static void RWIHashList(final File dataHome, final File appHome, final String targetName, final String resource, final String format) {
- Segment WordIndex = null;
- final Log log = new Log("HASHLIST");
- final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
- final String wordChunkStartHash = "AAAAAAAAAAAA";
- try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
- log.logInfo("STARTING CREATION OF RWI-HASHLIST");
- final File root = dataHome;
- try {
- Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
- if (resource.equals("all")) {
- WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
- WordIndex.connectRWI(10000, Integer.MAX_VALUE);
- WordIndex.connectUrlDb(false, false);
- indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
- }
- int counter = 0;
- ReferenceContainer<WordReference> container = null;
- if (format.equals("zip")) {
- log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
- final ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
- final File file = new File(root, targetName + ".zip");
- final ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
- bos.putNextEntry(zipEntry);
- if(indexContainerIterator != null) {
- while (indexContainerIterator.hasNext()) {
- counter++;
- container = indexContainerIterator.next();
- bos.write(container.getTermHash());
- bos.write(serverCore.CRLF);
- if (counter % 500 == 0) {
- log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
- }
- }
- }
- bos.flush();
- bos.close();
- } else {
- log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
- final File file = new File(root, targetName + ".txt");
- final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
- if(indexContainerIterator != null) {
- while (indexContainerIterator.hasNext()) {
- counter++;
- container = indexContainerIterator.next();
- bos.write(container.getTermHash());
- bos.write(serverCore.CRLF);
- if (counter % 500 == 0) {
- log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
- }
- }
- }
- bos.flush();
- bos.close();
- }
- log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : ASCII.String(container.getTermHash())));
"null" : ASCII.String(container.getTermHash()))); - } catch (final IOException e) { - log.logSevere("IOException", e); - } - if (WordIndex != null) { - WordIndex.close(); - WordIndex = null; - } - } - - /** - * Searching for peers affected by Bug - * @param homePath - */ - public static void testPeerDB(final File homePath) { - - try { - final File yacyDBPath = new File(homePath, "DATA/INDEX/freeworld/NETWORK"); - - final String[] dbFileNames = {"seed.new.db","seed.old.db","seed.pot.db"}; - for (final String dbFileName : dbFileNames) { - final File dbFile = new File(yacyDBPath,dbFileName); - final MapDataMining db = new MapDataMining(dbFile, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, SeedDB.sortFields, SeedDB.longaccFields, SeedDB.doubleaccFields); - - Iterator>> it; - it = db.entries(true, false); - while (it.hasNext()) { - final Map.Entry> dna = it.next(); - String peerHash = UTF8.String(dna.getKey()); - if (peerHash.length() < Word.commonHashLength) { - final String peerName = dna.getValue().get("Name"); - final String peerIP = dna.getValue().get("IP"); - final String peerPort = dna.getValue().get("Port"); - - while (peerHash.length() < Word.commonHashLength) { peerHash = peerHash + "_"; } - System.err.println("Invalid Peer-Hash found in '" + dbFileName + "': " + peerName + ":" + peerHash + ", http://" + peerIP + ":" + peerPort); - } - } - db.close(); - } - } catch (final Exception e) { - Log.logException(e); - } - } - - /** * Main-method which is started by java. Checks for special arguments or * starts up the application. @@ -993,46 +631,6 @@ public final class yacy { } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) { // show yacy version System.out.println(copyright); - } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) { - // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible - // attention: this may run long and should not be interrupted! 
- if (args.length >= 3 && args[1].toLowerCase().equals("-cache")) {
- args = shift(args, 1, 2);
- }
- if (args.length == 2) applicationRoot= new File(args[1]);
- minimizeUrlDB(dataRoot, applicationRoot, "freeworld");
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-testpeerdb"))) {
- if (args.length == 2) {
- applicationRoot = new File(args[1]);
- } else if (args.length > 2) {
- System.err.println("Usage: -testPeerDB [homeDbRoot]");
- }
- testPeerDB(applicationRoot);
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-genwordstat"))) {
- // this can help to create a stop-word list
- // to use this, you need a 'yacy.words' file in the root path
- // start this with "java -classpath classes yacy -genwordstat []"
- if (args.length == 2) applicationRoot= new File(args[1]);
- genWordstat(applicationRoot);
- } else if ((args.length == 4) && (args[0].toLowerCase().equals("-cleanwordlist"))) {
- // this can be used to organize and clean a word-list
- // start this with "java -classpath classes yacy -cleanwordlist "
- final int minlength = Integer.parseInt(args[2]);
- final int maxlength = Integer.parseInt(args[3]);
- cleanwordlist(args[1], minlength, maxlength);
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
- // generate a url list and save it in a file
- if (args.length == 2) applicationRoot= new File(args[1]);
- urldbcleanup(dataRoot, applicationRoot, "freeworld");
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-rwihashlist"))) {
- // generate a url list and save it in a file
- String domain = "all";
- String format = "txt";
- if (args.length >= 2) domain= args[1];
- if (args.length >= 3) format= args[2];
- if (args.length == 4) applicationRoot= new File(args[3]);
- final String outfile = "rwihashlist_" + System.currentTimeMillis();
- RWIHashList(dataRoot, applicationRoot, outfile, domain, format);
 } else {
 if (args.length == 1) applicationRoot= new File(args[0]);
 startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false);
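For reference, the word-list maintenance that the deleted cleanwordlist() routine performed (cut each word at the first separator character, keep only words within the given length range, de-duplicate them in a sorted set, and rewrite the file only if anything was dropped) does not depend on YaCy internals. A minimal standalone sketch of the same logic, assuming a hypothetical WordlistCleanup class and standard java.nio file I/O, could look like this; it is an illustration only and not part of this patch:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.TreeSet;

// Illustrative re-implementation of the removed cleanwordlist() logic (hypothetical class, not YaCy code).
public class WordlistCleanup {

    // same separator set as the removed code
    private static final String SEPS = "' .,:/-&";

    public static void clean(final Path wordlist, final int minlength, final int maxlength) throws IOException {
        final List<String> lines = Files.readAllLines(wordlist, StandardCharsets.UTF_8);
        final TreeSet<String> wordset = new TreeSet<String>();
        for (String word : lines) {
            word = word.toLowerCase().trim();
            // cut the word at the first separator character, as the original loop did
            for (int i = 0; i < SEPS.length(); i++) {
                final int p = word.indexOf(SEPS.charAt(i));
                if (p >= 0) word = word.substring(0, p);
            }
            if (word.length() >= minlength && word.length() <= maxlength) wordset.add(word);
        }
        // rewrite the file only if duplicates or out-of-range words were dropped
        if (wordset.size() != lines.size()) {
            Files.write(wordlist, wordset, StandardCharsets.UTF_8);
        }
    }

    public static void main(final String[] args) throws IOException {
        // usage: java WordlistCleanup <wordlist> <minlength> <maxlength>
        clean(Paths.get(args[0]), Integer.parseInt(args[1]), Integer.parseInt(args[2]));
    }
}

The sorted TreeSet gives the same side effect as the original code: the rewritten word list comes out de-duplicated and alphabetically ordered.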