implemented option to extract nurls from the database

(plus some iteration enhancements for nurls)
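For reference, once this commit is applied the nurl (notice-URL) database can be dumped from the command line. The invocation below is derived from the usage string printed by yacy.java in this commit; the memory size and the DATA path are placeholders:

    java -Xmx600m -classpath classes yacy -domlist -source nurl -format text <path to DATA folder>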

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2325 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 7fd90ca7c8
commit ad692fc6c7

@@ -334,16 +334,14 @@ public class IndexControl_p {
         // generate list
         if (post.containsKey("urlhashsimilar")) {
             try {
-                final Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true);
-                StringBuffer result = new StringBuffer(
-                        "Sequential List of URL-Hashes:<br>");
-                String hash;
+                final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
+                StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
+                plasmaCrawlLURL.Entry entry;
                 int i = 0;
-                while (hashIt.hasNext() && i < 256) {
-                    hash = (String) hashIt.next();
-                    result.append("<a href=\"/IndexControl_p.html?").append("urlhash=").append(hash).append("&urlhashsearch=")
-                          .append("\" class=\"tt\">").append(hash).append("</a> ").append(((i + 1) % 8 == 0) ? "<br>" : "");
+                while (entryIt.hasNext() && i < 256) {
+                    entry = (plasmaCrawlLURL.Entry) entryIt.next();
+                    result.append("<a href=\"/IndexControl_p.html?").append("urlhash=").append(entry.hash()).append("&urlhashsearch=")
+                          .append("\" class=\"tt\">").append(entry.hash()).append("</a> ").append(((i + 1) % 8 == 0) ? "<br>" : "");
                     i++;
                 }
                 prop.put("result", result.toString());

@@ -31,7 +31,6 @@ import de.anomic.net.URL;
 import java.net.MalformedURLException;
 import java.text.SimpleDateFormat;
 import java.util.HashMap;
-import java.util.Iterator;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.kelondro.kelondroBase64Order;
@@ -536,8 +535,4 @@ public class indexURL {
         return hash;
     }
-
-    public Iterator urlHashes(String urlHash, boolean up) throws IOException {
-        return urlHashCache.keys(up, false, urlHash.getBytes());
-    }
 }

@@ -9,7 +9,6 @@ import java.util.TreeMap;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlProfile;
 import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaCrawlNURL.Entry;

 public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {
@@ -118,25 +117,25 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {
         }

         // getting an iterator and loop through the URL entries
-        Iterator iter = (stackTypes[i] == -1)?this.importNurlDB.urlHashes("------------", true):null;
+        Iterator entryIter = (stackTypes[i] == -1) ? this.importNurlDB.entries(true, false, null) : null;
         while (true) {
             String nextHash = null;
-            Entry urlEntry = null;
+            plasmaCrawlNURL.Entry nextEntry = null;

             try {
                 if (stackTypes[i] != -1) {
                     if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break;

                     this.urlCount++;
-                    urlEntry = this.importNurlDB.pop(stackTypes[i]);
-                    nextHash = urlEntry.hash();
+                    nextEntry = this.importNurlDB.pop(stackTypes[i]);
+                    nextHash = nextEntry.hash();
                 } else {
-                    if (!iter.hasNext()) break;
+                    if (!entryIter.hasNext()) break;

                     this.urlCount++;
-                    nextHash = (String)iter.next();
-                    urlEntry = this.importNurlDB.getEntry(nextHash);
+                    nextEntry = (plasmaCrawlNURL.Entry) entryIter.next();
+                    nextHash = nextEntry.hash();
                 }
             } catch (IOException e) {
                 this.log.logWarning("Unable to import entry: " + e.toString());
@@ -147,7 +146,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {

             // getting a handler to the crawling profile the url belongs to
             try {
-                String profileHandle = urlEntry.profileHandle();
+                String profileHandle = nextEntry.profileHandle();
                 if (profileHandle == null) {
                     this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown.");
                     continue;
@@ -176,7 +175,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {

             // if the url does not already exist in the destination stack we insert it now
             if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
-                plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry);
+                plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(nextEntry);
                 ne.store();
                 this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
             }
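
The net effect of the importer change, condensed from the hunks above: the old code iterated raw hash strings and needed a second database lookup (getEntry) for every URL, while the new iterator yields fully decoded entries directly.

    // before: two database touches per URL
    nextHash = (String)iter.next();
    urlEntry = this.importNurlDB.getEntry(nextHash);

    // after: the iterator already carries the decoded entry
    nextEntry = (plasmaCrawlNURL.Entry) entryIter.next();
    nextHash = nextEntry.hash();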

@@ -46,6 +46,7 @@ package de.anomic.plasma;

 import java.io.File;
 import java.io.IOException;
 import de.anomic.net.URL;
+import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashSet;
@@ -497,23 +498,9 @@ public class plasmaCrawlNURL extends indexURL {
            this.hash = hash;
            kelondroRow.Entry entry = urlHashCache.get(hash.getBytes());
            if (entry != null) {
-               //try {
-                   this.initiator = entry.getColString(1, null);
-                   this.url = new URL(entry.getColString(2, null).trim());
-                   this.referrer = (entry.empty(3)) ? dummyHash : entry.getColString(3, null);
-                   this.name = (entry.empty(4)) ? "" : entry.getColString(4, null).trim();
-                   this.loaddate = new Date(86400000 * entry.getColLongB64E(5));
-                   this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
-                   this.depth = (int) entry.getColLongB64E(7);
-                   this.anchors = (int) entry.getColLongB64E(8);
-                   this.forkfactor = (int) entry.getColLongB64E(9);
-                   this.flags = new bitfield(entry.getColBytes(10));
-                   this.handle = Integer.parseInt(entry.getColString(11, null), 16);
-                   this.stored = true;
-                   return;
-               //} catch (MalformedURLException e) {
-               //    throw new IOException("plasmaCrawlNURL/Entry: " + e);
-               //}
+               insertEntry(entry);
+               this.stored = true;
+               return;
            } else {
                // show that we found nothing
                throw new IOException("NURL: hash " + hash + " not found");
@@ -521,6 +508,28 @@ public class plasmaCrawlNURL extends indexURL {
            }
        }

+       public Entry(kelondroRow.Entry entry) throws IOException {
+           assert (entry != null);
+           insertEntry(entry);
+           this.stored = false;
+       }
+
+       private void insertEntry(kelondroRow.Entry entry) throws IOException {
+           this.hash = entry.getColString(0, null);
+           this.initiator = entry.getColString(1, null);
+           this.url = new URL(entry.getColString(2, null).trim());
+           this.referrer = (entry.empty(3)) ? dummyHash : entry.getColString(3, null);
+           this.name = (entry.empty(4)) ? "" : entry.getColString(4, null).trim();
+           this.loaddate = new Date(86400000 * entry.getColLongB64E(5));
+           this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
+           this.depth = (int) entry.getColLongB64E(7);
+           this.anchors = (int) entry.getColLongB64E(8);
+           this.forkfactor = (int) entry.getColLongB64E(9);
+           this.flags = new bitfield(entry.getColBytes(10));
+           this.handle = Integer.parseInt(entry.getColString(11, null), 16);
+           return;
+       }
+
        public void store() {
            // stores the values from the object variables into the database
            if (this.stored) return;
@@ -616,4 +625,40 @@ public class plasmaCrawlNURL extends indexURL {
        }
    }

+   public class kiter implements Iterator {
+       // enumerates entry elements
+       Iterator i;
+       boolean error = false;
+
+       public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
+           i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
+           error = false;
+       }
+
+       public boolean hasNext() {
+           if (error) return false;
+           return i.hasNext();
+       }
+
+       public Object next() throws RuntimeException {
+           kelondroRow.Entry e = (kelondroRow.Entry) i.next();
+           if (e == null) return null;
+           try {
+               return new Entry(e);
+           } catch (IOException ex) {
+               throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
+           }
+       }
+
+       public void remove() {
+           i.remove();
+       }
+   }
+
+   public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
+       // enumerates entry elements
+       return new kiter(up, rotating, firstHash);
+   }
 }
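
A minimal consumer sketch for the new entries() iterator, using only names visible in this commit (noticeURL stands in for a plasmaCrawlNURL instance, as in yacy.java's pool.noticeURL). Note that kiter.next() rethrows I/O failures as a RuntimeException, so a caller that should survive a single unreadable row has to catch it:

    Iterator eiter = noticeURL.entries(true, false, null); // ascending, non-rotating, from the first hash
    while (eiter.hasNext()) {
        try {
            plasmaCrawlNURL.Entry entry = (plasmaCrawlNURL.Entry) eiter.next();
            if ((entry != null) && (entry.url() != null)) System.out.println(entry.url());
        } catch (RuntimeException e) {
            // one corrupt row should not abort the whole scan
        }
    }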

@@ -80,6 +80,7 @@ import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.kelondro.kelondroMap;
 import de.anomic.plasma.plasmaCrawlLURL;
 import de.anomic.plasma.plasmaCrawlEURL;
+import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURLPool;
 import de.anomic.plasma.plasmaWordIndex;
@@ -941,7 +942,7 @@ public final class yacy {
        System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
        System.out.println("a dump will be written after double-check of all extracted domains.");
        System.out.println("This process may fail in case of too less memory. To increase memory, start with");
-       System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
+       System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
        int c = 0;
        long start = System.currentTimeMillis();
        if (source.equals("lurl")) {
@@ -982,6 +983,25 @@ public final class yacy {
                    ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
            }
        }
+       if (source.equals("nurl")) {
+           Iterator eiter = pool.noticeURL.entries(true, false, null);
+           plasmaCrawlNURL.Entry entry;
+           while (eiter.hasNext()) {
+               try {
+                   entry = (plasmaCrawlNURL.Entry) eiter.next();
+                   if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
+               } catch (Exception e) {
+                   // here a MalformedURLException may occur
+                   // just ignore
+               }
+               c++;
+               if (c % 10000 == 0) System.out.println(
+                   c + " urls checked, " +
+                   doms.size() + " domains collected, " +
+                   ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
+                   ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
+           }
+       }

        if (format.equals("html")) {
            // output file in HTML format
@@ -999,7 +1019,7 @@ public final class yacy {
                entry = (Map.Entry) i.next();
                key = (String) entry.getKey();
                bos.write(("<a href=\"http://" + key + "\">" + key + "</a>" +
-                       ((entry.getValue() == null) ? "" : ((String) entry.getValue())) + "<br>"
+                       ((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "<br>"
                ).getBytes());
                bos.write(serverCore.crlf);
            }
@@ -1068,6 +1088,22 @@ public final class yacy {
                    }
                }
            }
+           if (source.equals("nurl")) {
+               Iterator eiter = pool.noticeURL.entries(true, false, null);
+               plasmaCrawlNURL.Entry entry;
+               while (eiter.hasNext()) {
+                   entry = (plasmaCrawlNURL.Entry) eiter.next();
+                   if ((entry != null) && (entry.url() != null)) {
+                       if (html) {
+                           bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8"));
+                           bos.write(serverCore.crlf);
+                       } else {
+                           bos.write(entry.url().toString().getBytes());
+                           bos.write(serverCore.crlf);
+                       }
+                   }
+               }
+           }
            bos.close();
            pool.close();
        } catch (IOException e) {
@@ -1294,7 +1330,8 @@ public final class yacy {
            // generate a url list and save it in a file
            String source = "lurl";
            if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
-               if ((args[2].equals("lurl")) ||
+               if ((args[2].equals("nurl")) ||
+                   (args[2].equals("lurl")) ||
                    (args[2].equals("eurl")))
                    source = args[2];
                args = shift(args, 1, 2);
@@ -1308,13 +1345,14 @@ public final class yacy {
                args = shift(args, 1, 2);
            }
            if (args.length == 2) applicationRoot= args[1];
-           String outfile = "domlist_" + System.currentTimeMillis();
+           String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
            domlist(applicationRoot, source, format, outfile);
        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) {
            // generate a url list and save it in a file
            String source = "lurl";
            if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
-               if ((args[2].equals("lurl")) ||
+               if ((args[2].equals("nurl")) ||
+                   (args[2].equals("lurl")) ||
                    (args[2].equals("eurl")))
                    source = args[2];
                args = shift(args, 1, 2);
@@ -1325,7 +1363,7 @@ public final class yacy {
                args = shift(args, 1, 2);
            }
            if (args.length == 2) applicationRoot= args[1];
-           String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
+           String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
            urllist(applicationRoot, source, html, outfile);
        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
            // generate a url list and save it in a file
