diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 89196a279..c4be0da99 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -334,16 +334,14 @@ public class IndexControl_p { // generate list if (post.containsKey("urlhashsimilar")) { try { - final Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true); - - StringBuffer result = new StringBuffer( - "Sequential List of URL-Hashes:
"); - String hash; + final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash); + StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:
"); + plasmaCrawlLURL.Entry entry; int i = 0; - while (hashIt.hasNext() && i < 256) { - hash = (String) hashIt.next(); - result.append("").append(hash).append(" ").append(((i + 1) % 8 == 0) ? "
" : ""); + while (entryIt.hasNext() && i < 256) { + entry = (plasmaCrawlLURL.Entry) entryIt.next(); + result.append("").append(entry.hash()).append(" ").append(((i + 1) % 8 == 0) ? "
" : ""); i++; } prop.put("result", result.toString()); diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index b401da9b7..df44befc4 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -31,7 +31,6 @@ import de.anomic.net.URL; import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.HashMap; -import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.kelondroBase64Order; @@ -536,8 +535,4 @@ public class indexURL { return hash; } - public Iterator urlHashes(String urlHash, boolean up) throws IOException { - return urlHashCache.keys(up, false, urlHash.getBytes()); - } - } diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 3f80cc108..e3368dc6f 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -9,7 +9,6 @@ import java.util.TreeMap; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaCrawlNURL.Entry; public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter { @@ -118,25 +117,25 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor } // getting an interator and loop through the URL entries - Iterator iter = (stackTypes[i] == -1)?this.importNurlDB.urlHashes("------------", true):null; + Iterator entryIter = (stackTypes[i] == -1) ? this.importNurlDB.entries(true, false, null) : null; while (true) { String nextHash = null; - Entry urlEntry = null; + plasmaCrawlNURL.Entry nextEntry = null; try { if (stackTypes[i] != -1) { if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break; this.urlCount++; - urlEntry = this.importNurlDB.pop(stackTypes[i]); - nextHash = urlEntry.hash(); + nextEntry = this.importNurlDB.pop(stackTypes[i]); + nextHash = nextEntry.hash(); } else { - if (!iter.hasNext()) break; + if (!entryIter.hasNext()) break; this.urlCount++; - nextHash = (String)iter.next(); - urlEntry = this.importNurlDB.getEntry(nextHash); + nextEntry = (plasmaCrawlNURL.Entry) entryIter.next(); + nextHash = nextEntry.hash(); } } catch (IOException e) { this.log.logWarning("Unable to import entry: " + e.toString()); @@ -147,7 +146,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor // getting a handler to the crawling profile the url belongs to try { - String profileHandle = urlEntry.profileHandle(); + String profileHandle = nextEntry.profileHandle(); if (profileHandle == null) { this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown."); continue; @@ -176,7 +175,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor // if the url does not alredy exists in the destination stack we insert it now if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) { - plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry); + plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(nextEntry); ne.store(); this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash()); } diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 57c94a6d8..3f91d1545 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -46,6 +46,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; import de.anomic.net.URL; + import java.util.ArrayList; import java.util.Date; import java.util.HashSet; @@ -497,23 +498,9 @@ public class plasmaCrawlNURL extends indexURL { this.hash = hash; kelondroRow.Entry entry = urlHashCache.get(hash.getBytes()); if (entry != null) { - //try { - this.initiator = entry.getColString(1, null); - this.url = new URL(entry.getColString(2, null).trim()); - this.referrer = (entry.empty(3)) ? dummyHash : entry.getColString(3, null); - this.name = (entry.empty(4)) ? "" : entry.getColString(4, null).trim(); - this.loaddate = new Date(86400000 * entry.getColLongB64E(5)); - this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim(); - this.depth = (int) entry.getColLongB64E(7); - this.anchors = (int) entry.getColLongB64E(8); - this.forkfactor = (int) entry.getColLongB64E(9); - this.flags = new bitfield(entry.getColBytes(10)); - this.handle = Integer.parseInt(entry.getColString(11, null), 16); - this.stored = true; - return; - //} catch (MalformedURLException e) { - // throw new IOException("plasmaCrawlNURL/Entry: " + e); - //} + insertEntry(entry); + this.stored = true; + return; } else { // show that we found nothing throw new IOException("NURL: hash " + hash + " not found"); @@ -521,6 +508,28 @@ public class plasmaCrawlNURL extends indexURL { } } + public Entry(kelondroRow.Entry entry) throws IOException { + assert (entry != null); + insertEntry(entry); + this.stored = false; + } + + private void insertEntry(kelondroRow.Entry entry) throws IOException { + this.hash = entry.getColString(0, null); + this.initiator = entry.getColString(1, null); + this.url = new URL(entry.getColString(2, null).trim()); + this.referrer = (entry.empty(3)) ? dummyHash : entry.getColString(3, null); + this.name = (entry.empty(4)) ? "" : entry.getColString(4, null).trim(); + this.loaddate = new Date(86400000 * entry.getColLongB64E(5)); + this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim(); + this.depth = (int) entry.getColLongB64E(7); + this.anchors = (int) entry.getColLongB64E(8); + this.forkfactor = (int) entry.getColLongB64E(9); + this.flags = new bitfield(entry.getColBytes(10)); + this.handle = Integer.parseInt(entry.getColString(11, null), 16); + return; + } + public void store() { // stores the values from the object variables into the database if (this.stored) return; @@ -616,4 +625,40 @@ public class plasmaCrawlNURL extends indexURL { } } + public class kiter implements Iterator { + // enumerates entry elements + Iterator i; + boolean error = false; + + public kiter(boolean up, boolean rotating, String firstHash) throws IOException { + i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); + error = false; + } + + public boolean hasNext() { + if (error) return false; + return i.hasNext(); + } + + public Object next() throws RuntimeException { + kelondroRow.Entry e = (kelondroRow.Entry) i.next(); + if (e == null) return null; + try { + return new Entry(e); + } catch (IOException ex) { + throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); + } + } + + public void remove() { + i.remove(); + } + + } + + public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException { + // enumerates entry elements + return new kiter(up, rotating, firstHash); + } + } diff --git a/source/yacy.java b/source/yacy.java index c1d0e1f01..afc19a44f 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -80,6 +80,7 @@ import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlEURL; +import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURLPool; import de.anomic.plasma.plasmaWordIndex; @@ -941,7 +942,7 @@ public final class yacy { System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries."); System.out.println("a dump will be written after double-check of all extracted domains."); System.out.println("This process may fail in case of too less memory. To increase memory, start with"); - System.out.println("java -Xmxm -classpath classes yacy -domlist [ -source { lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ ]"); + System.out.println("java -Xmxm -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ ]"); int c = 0; long start = System.currentTimeMillis(); if (source.equals("lurl")) { @@ -982,6 +983,25 @@ public final class yacy { ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining."); } } + if (source.equals("nurl")) { + Iterator eiter = pool.noticeURL.entries(true, false, null); + plasmaCrawlNURL.Entry entry; + while (eiter.hasNext()) { + try { + entry = (plasmaCrawlNURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth()); + } catch (Exception e) { + // here a MalformedURLException may occur + // just ignore + } + c++; + if (c % 10000 == 0) System.out.println( + c + " urls checked, " + + doms.size() + " domains collected, " + + ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + + ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining."); + } + } if (format.equals("html")) { // output file in HTML format @@ -999,7 +1019,7 @@ public final class yacy { entry = (Map.Entry) i.next(); key = (String) entry.getKey(); bos.write(("" + key + "" + - ((entry.getValue() == null) ? "" : ((String) entry.getValue())) + "
" + ((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "
" ).getBytes()); bos.write(serverCore.crlf); } @@ -1068,6 +1088,22 @@ public final class yacy { } } } + if (source.equals("nurl")) { + Iterator eiter = pool.noticeURL.entries(true, false, null); + plasmaCrawlNURL.Entry entry; + while (eiter.hasNext()) { + entry = (plasmaCrawlNURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) { + if (html) { + bos.write(("" + entry.url() + " " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "
").getBytes("UTF-8")); + bos.write(serverCore.crlf); + } else { + bos.write(entry.url().toString().getBytes()); + bos.write(serverCore.crlf); + } + } + } + } bos.close(); pool.close(); } catch (IOException e) { @@ -1294,7 +1330,8 @@ public final class yacy { // generate a url list and save it in a file String source = "lurl"; if (args.length >= 3 && args[1].toLowerCase().equals("-source")) { - if ((args[2].equals("lurl")) || + if ((args[2].equals("nurl")) || + (args[2].equals("lurl")) || (args[2].equals("eurl"))) source = args[2]; args = shift(args, 1, 2); @@ -1308,13 +1345,14 @@ public final class yacy { args = shift(args, 1, 2); } if (args.length == 2) applicationRoot= args[1]; - String outfile = "domlist_" + System.currentTimeMillis(); + String outfile = "domlist_" + source + "_" + System.currentTimeMillis(); domlist(applicationRoot, source, format, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) { // generate a url list and save it in a file String source = "lurl"; if (args.length >= 3 && args[1].toLowerCase().equals("-source")) { - if ((args[2].equals("lurl")) || + if ((args[2].equals("nurl")) || + (args[2].equals("lurl")) || (args[2].equals("eurl"))) source = args[2]; args = shift(args, 1, 2); @@ -1325,7 +1363,7 @@ public final class yacy { args = shift(args, 1, 2); } if (args.length == 2) applicationRoot= args[1]; - String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); + String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); urllist(applicationRoot, source, html, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { // generate a url list and save it in a file