diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 89196a279..c4be0da99 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -334,16 +334,14 @@ public class IndexControl_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
- final Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true);
-
- StringBuffer result = new StringBuffer(
- "Sequential List of URL-Hashes:<br>");
- String hash;
+ final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
+ StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:
");
+ plasmaCrawlLURL.Entry entry;
int i = 0;
- while (hashIt.hasNext() && i < 256) {
- hash = (String) hashIt.next();
- result.append("").append(hash).append(" ").append(((i + 1) % 8 == 0) ? "<br>" : "");
+ while (entryIt.hasNext() && i < 256) {
+ entry = (plasmaCrawlLURL.Entry) entryIt.next();
+ result.append("").append(entry.hash()).append(" ").append(((i + 1) % 8 == 0) ? "<br>" : "");
i++;
}
prop.put("result", result.toString());
diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java
index b401da9b7..df44befc4 100644
--- a/source/de/anomic/index/indexURL.java
+++ b/source/de/anomic/index/indexURL.java
@@ -31,7 +31,6 @@ import de.anomic.net.URL;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
-import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
@@ -536,8 +535,4 @@ public class indexURL {
return hash;
}
- public Iterator urlHashes(String urlHash, boolean up) throws IOException {
- return urlHashCache.keys(up, false, urlHash.getBytes());
- }
-
}
diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
index 3f80cc108..e3368dc6f 100644
--- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
@@ -9,7 +9,6 @@ import java.util.TreeMap;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaCrawlNURL.Entry;
public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {
@@ -118,25 +117,25 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
// getting an interator and loop through the URL entries
- Iterator iter = (stackTypes[i] == -1)?this.importNurlDB.urlHashes("------------", true):null;
+ Iterator entryIter = (stackTypes[i] == -1) ? this.importNurlDB.entries(true, false, null) : null;
while (true) {
String nextHash = null;
- Entry urlEntry = null;
+ plasmaCrawlNURL.Entry nextEntry = null;
try {
if (stackTypes[i] != -1) {
if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break;
this.urlCount++;
- urlEntry = this.importNurlDB.pop(stackTypes[i]);
- nextHash = urlEntry.hash();
+ nextEntry = this.importNurlDB.pop(stackTypes[i]);
+ nextHash = nextEntry.hash();
} else {
- if (!iter.hasNext()) break;
+ if (!entryIter.hasNext()) break;
this.urlCount++;
- nextHash = (String)iter.next();
- urlEntry = this.importNurlDB.getEntry(nextHash);
+ nextEntry = (plasmaCrawlNURL.Entry) entryIter.next();
+ nextHash = nextEntry.hash();
}
} catch (IOException e) {
this.log.logWarning("Unable to import entry: " + e.toString());
@@ -147,7 +146,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// getting a handler to the crawling profile the url belongs to
try {
- String profileHandle = urlEntry.profileHandle();
+ String profileHandle = nextEntry.profileHandle();
if (profileHandle == null) {
this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown.");
continue;
@@ -176,7 +175,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// if the url does not alredy exists in the destination stack we insert it now
if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
- plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry);
+ plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(nextEntry);
ne.store();
this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
}
diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java
index 57c94a6d8..3f91d1545 100644
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@@ -46,6 +46,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import de.anomic.net.URL;
+
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
@@ -497,23 +498,9 @@ public class plasmaCrawlNURL extends indexURL {
this.hash = hash;
kelondroRow.Entry entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
- //try {
- this.initiator = entry.getColString(1, null);
- this.url = new URL(entry.getColString(2, null).trim());
- this.referrer = (entry.empty(3)) ? dummyHash : entry.getColString(3, null);
- this.name = (entry.empty(4)) ? "" : entry.getColString(4, null).trim();
- this.loaddate = new Date(86400000 * entry.getColLongB64E(5));
- this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
- this.depth = (int) entry.getColLongB64E(7);
- this.anchors = (int) entry.getColLongB64E(8);
- this.forkfactor = (int) entry.getColLongB64E(9);
- this.flags = new bitfield(entry.getColBytes(10));
- this.handle = Integer.parseInt(entry.getColString(11, null), 16);
- this.stored = true;
- return;
- //} catch (MalformedURLException e) {
- // throw new IOException("plasmaCrawlNURL/Entry: " + e);
- //}
+ insertEntry(entry);
+ this.stored = true;
+ return;
} else {
// show that we found nothing
throw new IOException("NURL: hash " + hash + " not found");
@@ -521,6 +508,28 @@ public class plasmaCrawlNURL extends indexURL {
}
}
+ public Entry(kelondroRow.Entry entry) throws IOException {
+ assert (entry != null);
+ insertEntry(entry);
+ this.stored = false;
+ }
+
+ private void insertEntry(kelondroRow.Entry entry) throws IOException {
+ this.hash = entry.getColString(0, null);
+ this.initiator = entry.getColString(1, null);
+ this.url = new URL(entry.getColString(2, null).trim());
+ this.referrer = (entry.empty(3)) ? dummyHash : entry.getColString(3, null);
+ this.name = (entry.empty(4)) ? "" : entry.getColString(4, null).trim();
+ this.loaddate = new Date(86400000 * entry.getColLongB64E(5));
+ this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
+ this.depth = (int) entry.getColLongB64E(7);
+ this.anchors = (int) entry.getColLongB64E(8);
+ this.forkfactor = (int) entry.getColLongB64E(9);
+ this.flags = new bitfield(entry.getColBytes(10));
+ this.handle = Integer.parseInt(entry.getColString(11, null), 16);
+ return;
+ }
+
public void store() {
// stores the values from the object variables into the database
if (this.stored) return;
@@ -616,4 +625,40 @@ public class plasmaCrawlNURL extends indexURL {
}
}
+ public class kiter implements Iterator {
+ // enumerates entry elements
+ Iterator i;
+ boolean error = false;
+
+ public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
+ i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
+ error = false;
+ }
+
+ public boolean hasNext() {
+ if (error) return false;
+ return i.hasNext();
+ }
+
+ public Object next() throws RuntimeException {
+ kelondroRow.Entry e = (kelondroRow.Entry) i.next();
+ if (e == null) return null;
+ try {
+ return new Entry(e);
+ } catch (IOException ex) {
+ throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
+ }
+ }
+
+ public void remove() {
+ i.remove();
+ }
+
+ }
+
+ public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
+ // enumerates entry elements
+ return new kiter(up, rotating, firstHash);
+ }
+
}
diff --git a/source/yacy.java b/source/yacy.java
index c1d0e1f01..afc19a44f 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -80,6 +80,7 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlEURL;
+import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool;
import de.anomic.plasma.plasmaWordIndex;
@@ -941,7 +942,7 @@ public final class yacy {
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
- System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path> ]");
+ System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path> ]");
int c = 0;
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
@@ -982,6 +983,25 @@ public final class yacy {
((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
+ if (source.equals("nurl")) {
+ Iterator eiter = pool.noticeURL.entries(true, false, null);
+ plasmaCrawlNURL.Entry entry;
+ while (eiter.hasNext()) {
+ try {
+ entry = (plasmaCrawlNURL.Entry) eiter.next();
+ if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
+ } catch (Exception e) {
+ // here a MalformedURLException may occur
+ // just ignore
+ }
+ c++;
+ if (c % 10000 == 0) System.out.println(
+ c + " urls checked, " +
+ doms.size() + " domains collected, " +
+ ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
+ ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
+ }
+ }
if (format.equals("html")) {
// output file in HTML format
@@ -999,7 +1019,7 @@ public final class yacy {
entry = (Map.Entry) i.next();
key = (String) entry.getKey();
bos.write(("" + key + "" +
- ((entry.getValue() == null) ? "" : ((String) entry.getValue())) + "<br>"
+ ((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "<br>"
).getBytes());
bos.write(serverCore.crlf);
}
@@ -1068,6 +1088,22 @@ public final class yacy {
}
}
}
+ if (source.equals("nurl")) {
+ Iterator eiter = pool.noticeURL.entries(true, false, null);
+ plasmaCrawlNURL.Entry entry;
+ while (eiter.hasNext()) {
+ entry = (plasmaCrawlNURL.Entry) eiter.next();
+ if ((entry != null) && (entry.url() != null)) {
+ if (html) {
+ bos.write(("" + entry.url() + " " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8"));
+ bos.write(serverCore.crlf);
+ } else {
+ bos.write(entry.url().toString().getBytes());
+ bos.write(serverCore.crlf);
+ }
+ }
+ }
+ }
bos.close();
pool.close();
} catch (IOException e) {
@@ -1294,7 +1330,8 @@ public final class yacy {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
- if ((args[2].equals("lurl")) ||
+ if ((args[2].equals("nurl")) ||
+ (args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
@@ -1308,13 +1345,14 @@ public final class yacy {
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
- String outfile = "domlist_" + System.currentTimeMillis();
+ String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
domlist(applicationRoot, source, format, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
- if ((args[2].equals("lurl")) ||
+ if ((args[2].equals("nurl")) ||
+ (args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
@@ -1325,7 +1363,7 @@ public final class yacy {
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
- String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
+ String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file