diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java
deleted file mode 100644
index c30ea0e7c..000000000
--- a/htroot/IndexCleaner_p.java
+++ /dev/null
@@ -1,108 +0,0 @@
-//-----------------------
-//part of the AnomicHTTPD caching proxy
-//(C) by Michael Peter Christen; mc@yacy.net
-//first published on http://www.anomic.de
-//Frankfurt, Germany, 2005
-//
-//This file is contributed by Matthias Soehnholz
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-//This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
-//the Free Software Foundation; either version 2 of the License, or
-//(at your option) any later version.
-//
-//This program is distributed in the hope that it will be useful,
-//but WITHOUT ANY WARRANTY; without even the implied warranty of
-//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
-//
-//You should have received a copy of the GNU General Public License
-//along with this program; if not, write to the Free Software
-//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-import net.yacy.cora.document.ASCII;
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.search.Switchboard;
-import net.yacy.search.index.MetadataRepository;
-import net.yacy.search.index.Segment;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-
-public class IndexCleaner_p {
- private static MetadataRepository.BlacklistCleaner urldbCleanerThread = null;
- private static Segment.ReferenceCleaner indexCleanerThread = null;
-
- public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
- final serverObjects prop = new serverObjects();
- final Switchboard sb = (Switchboard) env;
- prop.put("title", "DbCleanup_p");
-
- // get segment
- Segment indexSegment = sb.index;
-
- if (post!=null) {
- if (post.get("action").equals("ustart")) {
- if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
- urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker);
- urldbCleanerThread.start();
- }
- else {
- urldbCleanerThread.endPause();
- }
- }
- else if (post.get("action").equals("ustop") && (urldbCleanerThread!=null)) {
- urldbCleanerThread.abort();
- }
- else if (post.get("action").equals("upause") && (urldbCleanerThread!=null)) {
- urldbCleanerThread.pause();
- }
- else if (post.get("action").equals("rstart")) {
- if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
- indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes());
- indexCleanerThread.start();
- }
- else {
- indexCleanerThread.endPause();
- }
- }
- else if (post.get("action").equals("rstop") && (indexCleanerThread!=null)) {
- indexCleanerThread.abort();
- }
- else if (post.get("action").equals("rpause") && (indexCleanerThread!=null)) {
- indexCleanerThread.pause();
- }
- prop.put("LOCATION","");
- return prop;
- }
- if (urldbCleanerThread!=null) {
- prop.put("urldb", "1");
- prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100);
- prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
- prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
- prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
- prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash);
- prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl);
- prop.put("urldb_lastHash", urldbCleanerThread.lastHash);
- prop.put("urldb_threadAlive", Boolean.toString(urldbCleanerThread.isAlive()));
- prop.put("urldb_threadToString", urldbCleanerThread.toString());
- final double percent = ((double)urldbCleanerThread.blacklistedUrls/urldbCleanerThread.totalSearchedUrls)*100;
- prop.putNum("urldb_percent", percent);
- }
- if (indexCleanerThread!=null) {
- prop.put("rwidb", "1");
- prop.put("rwidb_threadAlive", Boolean.toString(indexCleanerThread.isAlive()));
- prop.put("rwidb_threadToString", indexCleanerThread.toString());
- prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
- prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize());
- prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : ASCII.String(indexCleanerThread.wordHashNow));
- prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : ASCII.String(indexCleanerThread.lastWordHash));
- prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);
-
- }
- return prop;
- }
-}
diff --git a/htroot/IndexControlCleaner_p.html b/htroot/IndexControlCleaner_p.html
deleted file mode 100644
index 7ef61758d..000000000
--- a/htroot/IndexControlCleaner_p.html
+++ /dev/null
@@ -1,108 +0,0 @@
-
-
-
- YaCy '#[clientname]#': Index Cleaner
- #%env/templates/metas.template%#
-
-
-
- #(inline)##%env/templates/header.template%#
-
-
- Steering of API Actions
-
- This table shows search results that had been sorted out from the search result display because their content had not been verified.
- This means that the searched word does not appear on the search page.
-
- ::#(/inline)#
- #(showtable)#::
-
- #(/showtable)#
-
- #%env/templates/footer.template%#
-
-
-
\ No newline at end of file
diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html
index 2fc254a01..9793e0d4d 100644
--- a/htroot/PerformanceQueues_p.html
+++ b/htroot/PerformanceQueues_p.html
@@ -72,14 +72,6 @@
RAM Cache |
Description |
-
- URLs in RAM buffer: |
- #[urlCacheSize]# |
-
- This is the size of the URL write buffer. Its purpose is to buffer incoming URLs
- in case of search result transmission and during DHT transfer.
- |
-
Words in RAM cache: (Size in KBytes) |
#[wordCacheSize]# (#[wordCacheSizeKBytes]# KB) |
diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java
index de414a3a4..d9f50d0c3 100644
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@@ -299,7 +299,6 @@ public class PerformanceQueues_p {
prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
// table cache settings
- prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize());
prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
diff --git a/htroot/PerformanceQueues_p.xml b/htroot/PerformanceQueues_p.xml
index 8535e76a9..e7fc14d95 100644
--- a/htroot/PerformanceQueues_p.xml
+++ b/htroot/PerformanceQueues_p.xml
@@ -24,7 +24,6 @@
#{/table}#
- #[urlCacheSize]#
#[wordCacheSize]#
#[maxURLinCache]#
#[maxAgeOfCache]#
diff --git a/htroot/env/templates/submenuBlacklist.template b/htroot/env/templates/submenuBlacklist.template
index 6c157a9eb..a02bb6c1f 100644
--- a/htroot/env/templates/submenuBlacklist.template
+++ b/htroot/env/templates/submenuBlacklist.template
@@ -5,6 +5,5 @@
-
\ No newline at end of file
diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java
deleted file mode 100644
index c2693ca52..000000000
--- a/source/de/anomic/data/URLAnalysis.java
+++ /dev/null
@@ -1,564 +0,0 @@
-// URLAnalysis.java
-// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 24.02.2009 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-package de.anomic.data;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.net.MalformedURLException;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.regex.Pattern;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.GZIPOutputStream;
-
-import net.yacy.cora.document.UTF8;
-import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.data.word.WordReferenceRow;
-import net.yacy.kelondro.index.HandleMap;
-import net.yacy.kelondro.index.HandleSet;
-import net.yacy.kelondro.index.RowSpaceExceededException;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.order.Base64Order;
-import net.yacy.kelondro.rwi.ReferenceContainerArray;
-import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.search.index.MetadataRepository;
-import net.yacy.search.index.Segment;
-import net.yacy.search.index.MetadataRepository.Export;
-
-public class URLAnalysis {
-
- private static final Pattern patternMinus = Pattern.compile("-");
-
- /**
- * processes to analyse URL lists
- */
-
- private static DigestURI poison = null;
- static {
- try {
- poison = new DigestURI("http://poison.org/poison");
- } catch (final MalformedURLException e) {
- poison = null;
- }
- }
-
- public static class splitter extends Thread {
-
- private final ArrayBlockingQueue<DigestURI> in;
- private final ConcurrentHashMap<String, Integer> out;
-
- public splitter(final ArrayBlockingQueue<DigestURI> in, final ConcurrentHashMap<String, Integer> out) {
- this.in = in;
- this.out = out;
- }
-
- @Override
- public void run() {
- try {
- DigestURI url;
- final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
- while (true) {
- try {
- url = this.in.take();
- if (url == poison) break;
- update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\."));
- update(p.matcher(url.getPath()).replaceAll("/").split("/"));
- } catch (final InterruptedException e) {
- Log.logException(e);
- }
- }
- } catch (final Exception e) {
- Log.logException(e);
- }
- }
-
- private void update(final String[] s) {
- Integer c;
- for (final String t: s) {
- if (t.isEmpty()) continue;
- c = this.out.get(t);
- this.out.put(t, (c == null) ? 1 : c.intValue() + 1);
- }
- }
- }
-
- public static void cleanup(final ConcurrentHashMap<String, Integer> stat) {
- Map.Entry<String, Integer> entry;
- int c, low = Integer.MAX_VALUE;
- Iterator<Map.Entry<String, Integer>> i = stat.entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- c = entry.getValue().intValue();
- if (c == 1) {
- i.remove();
- } else {
- if (c < low) low = c;
- }
- }
- i = stat.entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- c = entry.getValue().intValue();
- if (c == low) {
- i.remove();
- }
- }
- Runtime.getRuntime().gc();
- }
-
- public static void genstat(final String urlfile) {
-
- final boolean gz = urlfile.endsWith(".gz");
- final String analysis = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".stats.gz" : urlfile + ".stats";
- final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);
-
- // start threads
- final ArrayBlockingQueue<DigestURI> in = new ArrayBlockingQueue<DigestURI>(1000);
- final ConcurrentHashMap<String, Integer> out = new ConcurrentHashMap<String, Integer>();
- for (int i = 0, available = Runtime.getRuntime().availableProcessors(); i < available; i++) new splitter(in, out).start();
- final splitter spl = new splitter(in, out);
- spl.start();
-
- // put urls in queue
- final File infile = new File(urlfile);
- final File outfile = new File(analysis);
- BufferedReader reader = null;
- long time = System.currentTimeMillis();
- final long start = time;
- int count = 0;
-
- System.out.println("start processing");
- try {
- InputStream is = new BufferedInputStream(new FileInputStream(infile));
- if (gz) is = new GZIPInputStream(is);
- reader = new BufferedReader(new InputStreamReader(is));
- String line;
- while ((line = reader.readLine()) != null) {
- line = line.trim();
- if (line.length() > 0) {
- try {
- final DigestURI url = new DigestURI(line);
- in.put(url);
- } catch (final InterruptedException e) {
- Log.logException(e);
- } catch (final MalformedURLException e) {
- continue;
- }
- }
- count++;
- if (System.currentTimeMillis() - time > 1000) {
- time = System.currentTimeMillis();
- System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
- if (MemoryControl.available() < cleanuplimit) {
- System.out.println("starting cleanup, " + out.size() + " entries in statistic");
- cleanup(out);
- System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
- }
- }
- }
- reader.close();
- } catch (final IOException e) {
- Log.logException(e);
- } finally {
- if (reader != null) try { reader.close(); } catch (final Exception e) {}
- }
-
- // stop threads
- System.out.println("stopping threads");
- for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try {
- in.put(poison);
- } catch (final InterruptedException e) {
- Log.logException(e);
- }
- try {
- spl.join();
- } catch (final InterruptedException e1) {
- Log.logException(e1);
- }
-
- // generate statistics
- System.out.println("start processing results");
- final TreeMap<String, Integer> results = new TreeMap<String, Integer>();
- count = 0;
- Map.Entry<String, Integer> entry;
- final Iterator<Map.Entry<String, Integer>> i = out.entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue());
- count++;
- i.remove(); // free memory
- if (System.currentTimeMillis() - time > 10000) {
- time = System.currentTimeMillis();
- System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
- }
- }
-
- // write statistics
- System.out.println("start writing results");
- try {
- OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
- if (gz) os = new GZIPOutputStream(os);
- count = 0;
- for (final Map.Entry<String, Integer> e: results.entrySet()) {
- os.write(UTF8.getBytes(e.getKey()));
- os.write(new byte[]{'\t'});
- os.write(UTF8.getBytes(Integer.toString(e.getValue())));
- os.write(new byte[]{'\n'});
- count++;
- if (System.currentTimeMillis() - time > 10000) {
- time = System.currentTimeMillis();
- System.out.println("wrote " + count + " lines.");
- }
- }
- os.close();
- } catch (final IOException e) {
- Log.logException(e);
- }
-
- System.out.println("finished");
- }
-
- public static void genhost(final String urlfile) {
-
- final boolean gz = urlfile.endsWith(".gz");
- final String trunk = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".host" : urlfile + ".host";
- final HashSet<String> hosts = new HashSet<String>();
- final File infile = new File(urlfile);
- BufferedReader reader = null;
- long time = System.currentTimeMillis();
- final long start = time;
- int count = 0;
-
- System.out.println("start processing");
- try {
- InputStream is = new BufferedInputStream(new FileInputStream(infile));
- if (gz) is = new GZIPInputStream(is);
- reader = new BufferedReader(new InputStreamReader(is));
- String line;
- while ((line = reader.readLine()) != null) {
- line = line.trim();
- if (line.length() > 0) {
- try {
- final DigestURI url = new DigestURI(line);
- hosts.add(url.getHost());
- } catch (final MalformedURLException e) {
- continue;
- }
- }
- count++;
- if (System.currentTimeMillis() - time > 1000) {
- time = System.currentTimeMillis();
- System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
- }
- }
- reader.close();
- } catch (final IOException e) {
- Log.logException(e);
- } finally {
- if (reader != null) try { reader.close(); } catch (final Exception e) {}
- }
-
- // copy everything into a TreeSet to order it
- System.out.println("start processing results");
- final TreeSet<String> results = new TreeSet<String>();
- count = 0;
- final Iterator<String> i = hosts.iterator();
- while (i.hasNext()) {
- results.add(i.next());
- count++;
- i.remove(); // free memory
- if (System.currentTimeMillis() - time > 10000) {
- time = System.currentTimeMillis();
- System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
- }
- }
-
- // write hosts
- writeSet(trunk, gz, results);
-
- System.out.println("finished");
- }
-
- private static void writeSet(final String trunk, final boolean gz, final Set<String> set) {
-
- // write hosts
- System.out.println("start writing results");
- final File outfile = new File(trunk + ((gz) ? ".gz" : ""));
- long time = System.currentTimeMillis();
- try {
- OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
- if (gz) os = new GZIPOutputStream(os);
- int count = 0;
- for (final String h: set) {
- os.write(UTF8.getBytes(h));
- os.write(new byte[]{'\n'});
- count++;
- if (System.currentTimeMillis() - time > 10000) {
- time = System.currentTimeMillis();
- System.out.println("wrote " + count + " lines.");
- }
- }
- os.close();
- } catch (final IOException e) {
- Log.logException(e);
- }
-
- System.out.println("finished writing results");
- }
-
- public static void sortsplit(final String urlfile) {
-
- final boolean gz = urlfile.endsWith(".gz");
- final String trunk = ((gz) ? urlfile.substring(0, urlfile.length() - 3) : urlfile) + ".sort";
- final File infile = new File(urlfile);
- final TreeSet<String> urls = new TreeSet<String>();
- BufferedReader reader = null;
- long time = System.currentTimeMillis();
- final long start = time;
- int count = 0;
- int filecount = 0;
- final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);
-
- System.out.println("start processing");
- try {
- InputStream is = new BufferedInputStream(new FileInputStream(infile));
- if (gz) is = new GZIPInputStream(is);
- reader = new BufferedReader(new InputStreamReader(is));
- String line;
- while ((line = reader.readLine()) != null) {
- line = line.trim();
- if (line.length() > 0) {
- try {
- final DigestURI url = new DigestURI(line);
- urls.add(url.toNormalform(true, true));
- } catch (final MalformedURLException e) {
- continue;
- }
- }
- count++;
- if (System.currentTimeMillis() - time > 1000) {
- time = System.currentTimeMillis();
- System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
- }
- if (MemoryControl.available() < cleanuplimit) {
- writeSet(trunk + "." + filecount, gz, urls);
- filecount++;
- urls.clear();
- Runtime.getRuntime().gc();
- }
- }
- reader.close();
- } catch (final IOException e) {
- Log.logException(e);
- } finally {
- if (reader != null) try { reader.close(); } catch (final Exception e) {}
- }
-
- // write hosts
- writeSet(trunk + "." + filecount, gz, urls);
-
- System.out.println("finished");
- }
-
- public static void incell(final File cellPath, final String statisticPath) {
- try {
- final HandleMap idx = ReferenceContainerArray.referenceHashes(
- cellPath,
- Segment.wordReferenceFactory,
- Base64Order.enhancedCoder,
- WordReferenceRow.urlEntryRow);
- System.out.println("INDEX REFERENCE COLLECTION starting dump of statistics");
- idx.dump(new File(statisticPath));
- System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
- idx.close();
- } catch (final Exception e) {
- Log.logException(e);
- }
- }
-
- public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
- System.out.println("INDEX DIFF URL-COL startup");
- final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
- final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
- mr.connectUrlDb(Segment.UrlDbName, false, false);
- final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
- System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
- final long start = System.currentTimeMillis();
- long update = start - 7000;
- int count = 0;
- for (final byte[] refhash: mr) {
- if (idx.get(refhash) == -1) {
- // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
- hs.put(refhash);
- }
- count++;
- if (System.currentTimeMillis() - update > 10000) {
- System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - count) / count) / 60000) + " minutes remaining");
- update = System.currentTimeMillis();
- }
- }
- idx.close();
- mr.close();
- System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
- count = hs.dump(new File(diffFile));
- System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump");
- return count;
- }
-
- public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
- // format: 0=text, 1=html, 2=rss/xml
- System.out.println("URL EXPORT startup");
- final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
- mr.connectUrlDb(Segment.UrlDbName, false, false);
- final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
- System.out.println("URL EXPORT loaded dump, starting export");
- final Export e = mr.export(new File(export), ".*", hs, format, false);
- try {
- e.join();
- } catch (final InterruptedException e1) {
- Log.logException(e1);
- }
- System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
- }
-
- public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
- System.out.println("URL DELETE startup");
- final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
- mr.connectUrlDb(Segment.UrlDbName, false, false);
- final int mrSize = mr.size();
- final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
- System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
- for (final byte[] refhash: hs) {
- mr.remove(refhash);
- }
- System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
- }
-
- public static void main(final String[] args) {
- if (args[0].equals("-stat") && args.length >= 2) {
- // generate a statistics about common words in file, store to .stat
- // example:
- // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
- for (int i = 1; i < args.length; i++) genstat(args[i]);
- } else if (args[0].equals("-host") && args.length >= 2) {
- // generate a file .host containing only the hosts of the urls
- for (int i = 1; i < args.length; i++) genhost(args[i]);
- } else if (args[0].equals("-sort") && args.length >= 2) {
- // generate file .x.sort with sorted lists and split the file in smaller pieces
- for (int i = 1; i < args.length; i++) sortsplit(args[i]);
- } else if (args[0].equals("-incell") && args.length >= 2) {
- // generate a dump of all referenced URL hashes from a given RICELL
- // example:
- // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incell DATA/INDEX/freeworld/TEXT/RICELL used.dump
- incell(new File(args[1]), args[2]);
- } else if (args[0].equals("-diffurlcol") && args.length >= 3) {
- // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
- // example:
- // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump
- try {
- diffurlcol(args[1], args[2], args[3]);
- } catch (final Exception e) {
- Log.logException(e);
- }
- } else if (args[0].equals("-export") && args.length >= 4) {
- // export a url-list file
- // example:
- // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
- // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
- final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
- try {
- export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
- } catch (final Exception e) {
- Log.logException(e);
- }
- } else if (args[0].equals("-delete") && args.length >= 3) {
- // delete from URLs as given by urlreference diff dump
- // example:
- // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump
- // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
- try {
- delete(args[1], args[2]);
- } catch (final Exception e) {
- Log.logException(e);
- }
- } else {
- System.out.println("usage:");
- System.out.println();
- System.out.println("-stat ");
- System.out.println(" generate a statistics about common words in file, store to .stat");
- System.out.println();
- System.out.println("-host ");
- System.out.println(" generate a file .host containing only the hosts of the urls");
- System.out.println();
- System.out.println("-sort ");
- System.out.println(" generate file .x.sort with sorted lists and split the file in smaller pieces");
- System.out.println();
- System.out.println("-incollection ");
- System.out.println(" generate a dump of all referenced URL hashes");
- System.out.println();
- System.out.println("-diffurlcol ");
- System.out.println(" find URLs that occur in url-db but not in collections");
- System.out.println();
- System.out.println("-export ");
- System.out.println(" export urls to file. the last argument can be omitted, then all urls are exported");
- System.out.println();
- System.out.println("-delete ");
- System.out.println(" delete all urls that are listed in the diff-dump from the url-db");
- System.out.println();
- System.out.println("to do a complete clean-up of the url database, start the following:");
- System.out.println();
- System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump");
- System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump");
- System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump");
- System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump");
- System.out.println();
- }
- System.exit(0); // kill remaining threads
- }
-
- private static final String num(final int i) {
- final StringBuilder s = new StringBuilder(Integer.toString(i));
- while (s.length() < 9) s.insert(0, "0");
- return s.toString();
- }
-}
diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java
index d872885a4..fd458f54c 100644
--- a/source/net/yacy/search/index/MetadataRepository.java
+++ b/source/net/yacy/search/index/MetadataRepository.java
@@ -33,21 +33,17 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
-import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.solr.DoubleSolrConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
-import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
@@ -61,15 +57,11 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.repository.Blacklist;
-import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.solr.EmbeddedSolrConnector;
import org.apache.lucene.util.Version;
-import de.anomic.crawler.CrawlStacker;
-
 public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
// class objects
@@ -186,26 +178,20 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.solr.close();
}
- public int writeCacheSize() {
- if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
- if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
- return 0;
- }
-
/**
* generates an plasmaLURLEntry using the url hash
* if the url cannot be found, this returns null
* @param obrwi
* @return
*/
- public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
- if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
- final byte[] urlHash = obrwi.getElement().urlhash();
+ public URIMetadata load(WordReferenceVars wre, long weight) {
+ if (wre == null) return null; // all time was already wasted in takeRWI to get another element
+ final byte[] urlHash = wre.urlhash();
if (urlHash == null) return null;
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
- return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
+ return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
@@ -280,29 +266,25 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable
public boolean exists(final byte[] urlHash) {
if (urlHash == null) return false;
+ if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true;
try {
if (this.solr.exists(ASCII.String(urlHash))) return true;
} catch (final Throwable e) {
Log.logException(e);
}
- if (this.urlIndexFile == null) return false; // case may happen during shutdown
- return this.urlIndexFile.has(urlHash);
+ return false;
}
- public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
+ @Override
+ public Iterator<byte[]> iterator() {
try {
- return this.urlIndexFile.keys(up, firstKey);
+ return this.urlIndexFile.keys(true, null);
} catch (final IOException e) {
Log.logException(e);
return null;
}
}
- @Override
- public Iterator<byte[]> iterator() {
- return keys(true, null);
- }
-
 public CloneableIterator<URIMetadata> entries() throws IOException {
// enumerates entry elements
return new kiter();
@@ -367,186 +349,6 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
}
}
-
- /**
- * Uses an Iteration over urlHash.db to detect malformed URL-Entries.
- * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
- *
- * @param proxyConfig
- */
- public void deadlinkCleaner() {
- final Log log = new Log("URLDBCLEANUP");
- final HashSet<String> damagedURLS = new HashSet<String>();
- try {
- final Iterator<URIMetadata> eiter = entries(true, null);
- int iteratorCount = 0;
- while (eiter.hasNext()) try {
- eiter.next();
- iteratorCount++;
- } catch (final RuntimeException e) {
- if(e.getMessage() != null) {
- final String m = e.getMessage();
- damagedURLS.add(m.substring(m.length() - 12));
- } else {
- log.logSevere("RuntimeException:", e);
- }
- }
- log.logInfo("URLs vorher: " + this.urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
-
- final HTTPClient client = new HTTPClient();
- final Iterator<String> eiter2 = damagedURLS.iterator();
- byte[] urlHashBytes;
- while (eiter2.hasNext()) {
- urlHashBytes = ASCII.getBytes(eiter2.next());
-
- // trying to fix the invalid URL
- String oldUrlStr = null;
- try {
- // getting the url data as byte array
- final Row.Entry entry = this.urlIndexFile.get(urlHashBytes, true);
-
- // getting the wrong url string
- oldUrlStr = entry.getColUTF8(1).trim();
-
- int pos = -1;
- if ((pos = oldUrlStr.indexOf("://",0)) != -1) {
- // trying to correct the url
- final String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
- final DigestURI newUrl = new DigestURI(newUrlStr);
-
- if (client.HEADResponse(newUrl.toString()) != null
- && client.getHttpResponse().getStatusLine().getStatusCode() == 200) {
- entry.setCol(1, UTF8.getBytes(newUrl.toString()));
- this.urlIndexFile.put(entry);
- if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
- } else {
- remove(urlHashBytes);
- if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (client.getHttpResponse() == null ? "null" : client.getHttpResponse().getStatusLine()));
- }
- }
- } catch (final Exception e) {
- remove(urlHashBytes);
- if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
- }
- }
-
- log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size());
- } catch (final IOException e) {
- log.logSevere("IOException", e);
- }
- }
-
- public BlacklistCleaner getBlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) {
- return new BlacklistCleaner(blacklist, crawlStacker);
- }
-
- public class BlacklistCleaner extends Thread {
-
- private boolean run = true;
- private boolean pause;
- public int blacklistedUrls = 0;
- public int totalSearchedUrls = 1;
- public String lastBlacklistedUrl = "";
- public String lastBlacklistedHash = "";
- public String lastUrl = "";
- public String lastHash = "";
- private final Blacklist blacklist;
- private final CrawlStacker crawlStacker;
-
- public BlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) {
- this.blacklist = blacklist;
- this.crawlStacker = crawlStacker;
- }
-
- @Override
- public void run() {
- try {
- Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
- final Iterator<URIMetadata> eiter = entries(true, null);
- while (eiter.hasNext() && this.run) {
- synchronized (this) {
- if (this.pause) {
- try {
- this.wait();
- } catch (final InterruptedException e) {
- Log.logWarning("URLDBCLEANER", "InterruptedException", e);
- this.run = false;
- return;
- }
- }
- }
- final URIMetadata entry = eiter.next();
- if (entry == null) {
- if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
- } else if (entry.hash() == null) {
- if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + "hash == null");
- } else {
- this.totalSearchedUrls++;
- if (entry.url() == null) {
- if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + "URL == null");
- remove(entry.hash());
- continue;
- }
- if (this.blacklist.isListed(BlacklistType.CRAWLER, entry) ||
- this.blacklist.isListed(BlacklistType.DHT, entry) ||
- (this.crawlStacker.urlInAcceptedDomain(entry.url()) != null)) {
- this.lastBlacklistedUrl = entry.url().toNormalform(true, true);
- this.lastBlacklistedHash = ASCII.String(entry.hash());
- if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + " " + entry.url().toNormalform(false, true));
- remove(entry.hash());
- if (this.blacklistedUrls % 100 == 0) {
- Log.logInfo("URLDBCLEANER", "Deleted " + this.blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + this.lastBlacklistedUrl);
- }
- }
- this.lastUrl = entry.url().toNormalform(true, true);
- this.lastHash = ASCII.String(entry.hash());
- }
- }
- } catch (final RuntimeException e) {
- if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL",0) != -1) {
- Log.logWarning("URLDBCLEANER", "urlHash not found in LURL", e);
- }
- else {
- Log.logWarning("URLDBCLEANER", "RuntimeException", e);
- this.run = false;
- }
- } catch (final IOException e) {
- Log.logException(e);
- this.run = false;
- } catch (final Exception e) {
- Log.logException(e);
- this.run = false;
- }
- Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped");
- }
-
- public void abort() {
- synchronized(this) {
- this.run = false;
- notifyAll();
- }
- }
-
- public void pause() {
- synchronized(this) {
- if (!this.pause) {
- this.pause = true;
- Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused");
- }
- }
- }
-
- public void endPause() {
- synchronized(this) {
- if (this.pause) {
- this.pause = false;
- notifyAll();
- Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed");
- }
- }
- }
- }
-
// export methods
public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index f3c16058d..51b7a3ee4 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -33,7 +33,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
-import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -55,7 +54,6 @@ import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceRow;
-import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
@@ -66,7 +64,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.LookAheadIterator;
-import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.search.query.RWIProcess;
@@ -252,7 +249,7 @@ public class Segment {
* @param host
* @return an iterator for all url hashes that belong to a specific host
*/
- public Iterator<byte[]> hostSelector(String host) {
+ private Iterator<byte[]> hostSelector(String host) {
String hh = DigestURI.hosthash(host);
final HandleSet ref = new HandleSet(12, Base64Order.enhancedCoder, 100);
for (byte[] b: this.urlMetadata) {
@@ -551,12 +548,6 @@ public class Segment {
return newEntry;
}
-
- // method for index deletion
- public int removeAllUrlReferences(final DigestURI url, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {
- return removeAllUrlReferences(url.hash(), loader, cacheStrategy);
- }
-
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {
for (final byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy);
}
@@ -604,129 +595,4 @@ public class Segment {
}
}
-
- // The Cleaner class was provided as "UrldbCleaner" by Hydrox
- public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
- return new ReferenceCleaner(startHash);
- }
-
- public class ReferenceCleaner extends Thread {
-
- private final byte[] startHash;
- private boolean run = true;
- private boolean pause = false;
- public int rwiCountAtStart = 0;
- public byte[] wordHashNow = null;
- public byte[] lastWordHash = null;
- public int lastDeletionCounter = 0;
-
- public ReferenceCleaner(final byte[] startHash) {
- this.startHash = startHash;
- this.rwiCountAtStart = termIndex().sizesMax();
- }
-
- @Override
- public void run() {
- Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
- ReferenceContainer<WordReference> container = null;
- WordReferenceVars entry = null;
- DigestURI url = null;
- final HandleSet urlHashs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
- try {
- Iterator<ReferenceContainer<WordReference>> indexContainerIterator = Segment.this.termIndex.referenceContainer(this.startHash, false, false, 100, false).iterator();
- while (indexContainerIterator.hasNext() && this.run) {
- waiter();
- container = indexContainerIterator.next();
- final Iterator<WordReference> containerIterator = container.entries();
- this.wordHashNow = container.getTermHash();
- while (containerIterator.hasNext() && this.run) {
- waiter();
- entry = new WordReferenceVars(containerIterator.next());
- // System.out.println("Wordhash: "+wordHash+" UrlHash:
- // "+entry.getUrlHash());
- final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash());
- if (ue == null) {
- urlHashs.put(entry.urlhash());
- } else {
- url = ue.url();
- if (url == null || Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
- urlHashs.put(entry.urlhash());
- }
- }
- }
- if (!urlHashs.isEmpty()) try {
- final int removed = Segment.this.termIndex.remove(container.getTermHash(), urlHashs);
- Log.logFine("INDEXCLEANER", ASCII.String(container.getTermHash()) + ": " + removed + " of " + container.size() + " URL-entries deleted");
- this.lastWordHash = container.getTermHash();
- this.lastDeletionCounter = urlHashs.size();
- urlHashs.clear();
- } catch (final IOException e) {
- Log.logException(e);
- }
-
- if (!containerIterator.hasNext()) {
- // We may not be finished yet, try to get the next chunk of wordHashes
- final TreeSet<ReferenceContainer<WordReference>> containers = Segment.this.termIndex.referenceContainer(container.getTermHash(), false, false, 100, false);
- indexContainerIterator = containers.iterator();
- // Make sure we don't get the same wordhash twice, but don't skip a word
- if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) {
- indexContainerIterator = containers.iterator();
- }
- }
- }
- } catch (final IOException e) {
- Log.logException(e);
- } catch (final Exception e) {
- Log.logException(e);
- }
- Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
- }
-
- public void abort() {
- synchronized(this) {
- this.run = false;
- notifyAll();
- }
- }
-
- public void pause() {
- synchronized (this) {
- if (!this.pause) {
- this.pause = true;
- Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
- }
- }
- }
-
- public void endPause() {
- synchronized (this) {
- if (this.pause) {
- this.pause = false;
- notifyAll();
- Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
- }
- }
- }
-
- public void waiter() {
- synchronized (this) {
- if (this.pause) {
- try {
- this.wait();
- } catch (final InterruptedException e) {
- this.run = false;
- return;
- }
- }
- }
- }
-
- public int rwisize() {
- return termIndex().sizesMax();
- }
-
- public int urlsize() {
- return urlMetadata().size();
- }
- }
}
diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java
index 3248598d8..93e975e9e 100644
--- a/source/net/yacy/search/query/RWIProcess.java
+++ b/source/net/yacy/search/query/RWIProcess.java
@@ -628,7 +628,7 @@ public final class RWIProcess extends Thread
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
- final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi);
+ final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi.getElement(), obrwi.getWeight());
if ( page == null ) {
try {
this.misses.putUnique(obrwi.getElement().urlhash());
diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java
index d10281b63..b9e44cd7e 100644
--- a/source/net/yacy/yacy.java
+++ b/source/net/yacy/yacy.java
@@ -24,7 +24,6 @@
package net.yacy;
-import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
@@ -38,54 +37,33 @@ import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
-import java.util.Iterator;
-import java.util.Map;
import java.util.Properties;
-import java.util.TreeMap;
-import java.util.TreeSet;
import java.util.concurrent.Semaphore;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
import net.yacy.cora.date.GenericFormatter;
-import net.yacy.cora.document.ASCII;
-import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.sorting.Array;
-import net.yacy.cora.sorting.OrderedScoreMap;
-import net.yacy.cora.sorting.ScoreMap;
import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser;
-import net.yacy.kelondro.blob.MapDataMining;
-import net.yacy.kelondro.data.meta.URIMetadata;
-import net.yacy.kelondro.data.word.Word;
-import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.order.Base64Order;
-import net.yacy.kelondro.rwi.Reference;
-import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.OS;
-import net.yacy.peers.SeedDB;
import net.yacy.peers.operation.yacyBuildProperties;
import net.yacy.peers.operation.yacyRelease;
import net.yacy.peers.operation.yacyVersion;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
-import net.yacy.search.index.MetadataRepository;
-import net.yacy.search.index.Segment;
import com.google.common.io.Files;
import de.anomic.data.Translator;
import de.anomic.http.server.HTTPDemon;
import de.anomic.server.serverCore;
-import de.anomic.tools.enumerateFiles;
/**
* This is the main class of YaCy. Several threads are started from here:
@@ -595,346 +573,6 @@ public final class yacy {
Log.logConfig("COMMAND-STEERING", "SUCCESSFULLY FINISHED COMMAND: " + processdescription);
}
- /**
- * This method gets all found words and outputs a statistic about the score
- * of the words. The output of this method can be used to create stop-word
- * lists. This method will be called if you start yacy with the argument
- * -genwordstat.
- * FIXME: How can stop-word list be created from this output? What type of
- * score is output?
- *
- * @param homePath Root-Path where all the information is to be found.
- */
- private static void genWordstat(final File homePath) {
- // start up
- System.out.println(copyright);
- System.out.println(hline);
-
- // load words
- Log.logInfo("GEN-WORDSTAT", "loading words...");
- final TreeMap<byte[], String> words = loadWordMap(new File(homePath, "yacy.words"));
-
- // find all hashes
- Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases...");
- final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/");
- final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true);
- File f;
- byte[] h;
- final ScoreMap<byte[]> hs = new OrderedScoreMap<byte[]>(Base64Order.standardCoder);
- while (ef.hasMoreElements()) {
- f = ef.nextElement();
- h = f.getName().substring(0, Word.commonHashLength).getBytes();
- hs.inc(h, (int) f.length());
- }
-
- // list the hashes in reverse order
- Log.logInfo("GEN-WORDSTAT", "listing words in reverse size order...");
- String w;
- final Iterator<byte[]> i = hs.keys(false);
- while (i.hasNext()) {
- h = i.next();
- w = words.get(h);
- if (w == null) System.out.print("# " + h); else System.out.print(w);
- System.out.println(" - " + hs.get(h));
- }
-
- // finished
- Log.logConfig("GEN-WORDSTAT", "FINISHED");
- }
-
- /**
- * @param homePath path to the YaCy directory
- * @param networkName
- */
- public static void minimizeUrlDB(final File dataHome, final File appHome, final String networkName) {
- // run with "java -classpath classes yacy -minimizeUrlDB"
- try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
- final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
- final File indexRoot2 = new File(dataHome, "DATA/INDEX2");
- final Log log = new Log("URL-CLEANUP");
- try {
- log.logInfo("STARTING URL CLEANUP");
-
- // db containing all currently loades urls
- final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"));
- currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
-
- // db used to hold all neede urls
- final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
- minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
-
- final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
- if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
-
- final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
- wordIndex.connectRWI(10000, Integer.MAX_VALUE);
- wordIndex.connectUrlDb(false, false);
- final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
-
- long urlCounter = 0, wordCounter = 0;
- long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
- String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash;
-
- while (indexContainerIterator.hasNext()) {
- ReferenceContainer<WordReference> wordIdxContainer = null;
- try {
- wordCounter++;
- wordIdxContainer = indexContainerIterator.next();
-
- // the combined container will fit, read the container
- final Iterator<WordReference> wordIdxEntries = wordIdxContainer.entries();
- Reference iEntry;
- while (wordIdxEntries.hasNext()) {
- iEntry = wordIdxEntries.next();
- final byte[] urlHash = iEntry.urlhash();
- if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
- final URIMetadata urlEntry = currentUrlDB.load(urlHash);
- urlCounter++;
- minimizedUrlDB.store(urlEntry);
- if (urlCounter % 500 == 0) {
- log.logInfo(urlCounter + " URLs found so far.");
- }
- } catch (final IOException e) {}
- }
-
- if (wordCounter%500 == 0) {
- wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash());
- wordChunkEnd = System.currentTimeMillis();
- final long duration = wordChunkEnd - wordChunkStart;
- log.logInfo(wordCounter + " words scanned " +
- "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" +
- "Duration: "+ 500*1000/duration + " words/s" +
- " | Free memory: " + MemoryControl.free() +
- " | Total memory: " + MemoryControl.total());
- wordChunkStart = wordChunkEnd;
- wordChunkStartHash = wordChunkEndHash;
- }
-
- // we have read all elements, now we can close it
- wordIdxContainer = null;
-
- } catch (final Exception e) {
- log.logSevere("Exception", e);
- } finally {
- if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (final Exception e) {}
- }
- }
- log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries.");
- log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries.");
-
- currentUrlDB.close();
- minimizedUrlDB.close();
- wordIndex.close();
-
- // TODO: rename the mimimized UrlDB to the name of the previous UrlDB
-
- log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP");
- log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db");
-
- log.logInfo("TERMINATED URL CLEANUP");
- } catch (final Exception e) {
- log.logSevere("Exception: " + e.getMessage(), e);
- } catch (final Error e) {
- log.logSevere("Error: " + e.getMessage(), e);
- }
- }
-
- /**
- * Reads all words from the given file and creates a treemap, where key is
- * the plasma word hash and value is the word itself.
- *
- * @param wordlist File where the words are stored.
- * @return HashMap with the hash-word - relation.
- */
- private static TreeMap<byte[], String> loadWordMap(final File wordlist) {
- // returns a hash-word - Relation
- final TreeMap<byte[], String> wordmap = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
- try {
- String word;
- final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
- while ((word = br.readLine()) != null) wordmap.put(Word.word2hash(word), word);
- br.close();
- } catch (final IOException e) {}
- return wordmap;
- }
-
- /**
- * Cleans a wordlist in a file according to the length of the words. The
- * file with the given filename is read and then only the words in the given
- * length-range are written back to the file.
- *
- * @param wordlist Name of the file the words are stored in.
- * @param minlength Minimal needed length for each word to be stored.
- * @param maxlength Maximal allowed length for each word to be stored.
- */
- private static void cleanwordlist(final String wordlist, final int minlength, final int maxlength) {
- // start up
- System.out.println(copyright);
- System.out.println(hline);
- Log.logConfig("CLEAN-WORDLIST", "START");
-
- String word;
- final TreeSet<String> wordset = new TreeSet<String>();
- int count = 0;
- try {
- final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
- final String seps = "' .,:/-&";
- while ((word = br.readLine()) != null) {
- word = word.toLowerCase().trim();
- for (int i = 0; i < seps.length(); i++) {
- if (word.indexOf(seps.charAt(i)) >= 0) word = word.substring(0, word.indexOf(seps.charAt(i)));
- }
- if ((word.length() >= minlength) && (word.length() <= maxlength)) wordset.add(word);
- count++;
- }
- br.close();
-
- if (wordset.size() != count) {
- count = count - wordset.size();
- final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(wordlist)));
- while (!wordset.isEmpty()) {
- word = wordset.first();
- bw.write(word + "\n");
- wordset.remove(word);
- }
- bw.close();
- Log.logInfo("CLEAN-WORDLIST", "shrinked wordlist by " + count + " words.");
- } else {
- Log.logInfo("CLEAN-WORDLIST", "not necessary to change wordlist");
- }
- } catch (final IOException e) {
- Log.logSevere("CLEAN-WORDLIST", "ERROR: " + e.getMessage());
- System.exit(-1);
- }
-
- // finished
- Log.logConfig("CLEAN-WORDLIST", "FINISHED");
- }
-
- private static String[] shift(final String[] args, final int pos, final int count) {
- final String[] newargs = new String[args.length - count];
- System.arraycopy(args, 0, newargs, 0, pos);
- System.arraycopy(args, pos + count, newargs, pos, args.length - pos - count);
- return newargs;
- }
-
- /**
- * Uses an Iteration over urlHash.db to detect malformed URL-Entries.
- * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
- *
- * @param homePath Root-Path where all information is to be found.
- */
- private static void urldbcleanup(final File dataHome, final File appHome, final String networkName) {
- final File root = dataHome;
- final File indexroot = new File(root, "DATA/INDEX");
- try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
- final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
- currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
- currentUrlDB.deadlinkCleaner();
- currentUrlDB.close();
- }
-
- private static void RWIHashList(final File dataHome, final File appHome, final String targetName, final String resource, final String format) {
- Segment WordIndex = null;
- final Log log = new Log("HASHLIST");
- final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
- final String wordChunkStartHash = "AAAAAAAAAAAA";
- try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
- log.logInfo("STARTING CREATION OF RWI-HASHLIST");
- final File root = dataHome;
- try {
- Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
- if (resource.equals("all")) {
- WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
- WordIndex.connectRWI(10000, Integer.MAX_VALUE);
- WordIndex.connectUrlDb(false, false);
- indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
- }
- int counter = 0;
- ReferenceContainer<WordReference> container = null;
- if (format.equals("zip")) {
- log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
- final ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
- final File file = new File(root, targetName + ".zip");
- final ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
- bos.putNextEntry(zipEntry);
- if(indexContainerIterator != null) {
- while (indexContainerIterator.hasNext()) {
- counter++;
- container = indexContainerIterator.next();
- bos.write(container.getTermHash());
- bos.write(serverCore.CRLF);
- if (counter % 500 == 0) {
- log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
- }
- }
- }
- bos.flush();
- bos.close();
- } else {
- log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
- final File file = new File(root, targetName + ".txt");
- final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
- if(indexContainerIterator != null) {
- while (indexContainerIterator.hasNext()) {
- counter++;
- container = indexContainerIterator.next();
- bos.write(container.getTermHash());
- bos.write(serverCore.CRLF);
- if (counter % 500 == 0) {
- log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
- }
- }
- }
- bos.flush();
- bos.close();
- }
- log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : ASCII.String(container.getTermHash())));
- } catch (final IOException e) {
- log.logSevere("IOException", e);
- }
- if (WordIndex != null) {
- WordIndex.close();
- WordIndex = null;
- }
- }
-
- /**
- * Searching for peers affected by Bug
- * @param homePath
- */
- public static void testPeerDB(final File homePath) {
-
- try {
- final File yacyDBPath = new File(homePath, "DATA/INDEX/freeworld/NETWORK");
-
- final String[] dbFileNames = {"seed.new.db","seed.old.db","seed.pot.db"};
- for (final String dbFileName : dbFileNames) {
- final File dbFile = new File(yacyDBPath,dbFileName);
- final MapDataMining db = new MapDataMining(dbFile, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, SeedDB.sortFields, SeedDB.longaccFields, SeedDB.doubleaccFields);
-
- Iterator<Map.Entry<byte[], Map<String, String>>> it;
- it = db.entries(true, false);
- while (it.hasNext()) {
- final Map.Entry<byte[], Map<String, String>> dna = it.next();
- String peerHash = UTF8.String(dna.getKey());
- if (peerHash.length() < Word.commonHashLength) {
- final String peerName = dna.getValue().get("Name");
- final String peerIP = dna.getValue().get("IP");
- final String peerPort = dna.getValue().get("Port");
-
- while (peerHash.length() < Word.commonHashLength) { peerHash = peerHash + "_"; }
- System.err.println("Invalid Peer-Hash found in '" + dbFileName + "': " + peerName + ":" + peerHash + ", http://" + peerIP + ":" + peerPort);
- }
- }
- db.close();
- }
- } catch (final Exception e) {
- Log.logException(e);
- }
- }
-
-
/**
* Main-method which is started by java. Checks for special arguments or
* starts up the application.
@@ -993,46 +631,6 @@ public final class yacy {
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) {
// show yacy version
System.out.println(copyright);
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
- // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
- // attention: this may run long and should not be interrupted!
- if (args.length >= 3 && args[1].toLowerCase().equals("-cache")) {
- args = shift(args, 1, 2);
- }
- if (args.length == 2) applicationRoot= new File(args[1]);
- minimizeUrlDB(dataRoot, applicationRoot, "freeworld");
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-testpeerdb"))) {
- if (args.length == 2) {
- applicationRoot = new File(args[1]);
- } else if (args.length > 2) {
- System.err.println("Usage: -testPeerDB [homeDbRoot]");
- }
- testPeerDB(applicationRoot);
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-genwordstat"))) {
- // this can help to create a stop-word list
- // to use this, you need a 'yacy.words' file in the root path
- // start this with "java -classpath classes yacy -genwordstat []"
- if (args.length == 2) applicationRoot= new File(args[1]);
- genWordstat(applicationRoot);
- } else if ((args.length == 4) && (args[0].toLowerCase().equals("-cleanwordlist"))) {
- // this can be used to organize and clean a word-list
- // start this with "java -classpath classes yacy -cleanwordlist "
- final int minlength = Integer.parseInt(args[2]);
- final int maxlength = Integer.parseInt(args[3]);
- cleanwordlist(args[1], minlength, maxlength);
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
- // generate a url list and save it in a file
- if (args.length == 2) applicationRoot= new File(args[1]);
- urldbcleanup(dataRoot, applicationRoot, "freeworld");
- } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-rwihashlist"))) {
- // generate a url list and save it in a file
- String domain = "all";
- String format = "txt";
- if (args.length >= 2) domain= args[1];
- if (args.length >= 3) format= args[2];
- if (args.length == 4) applicationRoot= new File(args[3]);
- final String outfile = "rwihashlist_" + System.currentTimeMillis();
- RWIHashList(dataRoot, applicationRoot, outfile, domain, format);
} else {
if (args.length == 1) applicationRoot= new File(args[0]);
startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false);