Migrated the index export methods from the old metadata to Solr. Now

exports are done using Solr queries. Removed superfluous methods and
Michael Peter Christen 12 years ago
parent 1768c82010
commit 0fe7b6fd3b

@ -124,10 +124,9 @@ public class CrawlResults {
// Handles the "deletedomain" form action: removes a domain from the fulltext index
// and from the crawl result table identified by tabletype.
// NOTE(review): this is a diff rendering without +/- markers; the host-hash based
// statements and the host-name based statements below are presumably the old and the
// new variant of the same code, not two consecutive steps - confirm against the real file.
if (post.containsKey("deletedomain")) {
final String domain = post.get("domain", null);
// host-hash variant: derive the host hash part from the posted domain (null-safe)
final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
if (hashpart != null) {
sb.index.fulltext().deleteDomain(hashpart, null, false);
ResultURLs.deleteDomain(tabletype, domain, hashpart);
// host-name variant: delete by the posted domain name directly; null date and
// false are passed for the freshdate and concurrent arguments
if (domain != null) {
sb.index.fulltext().deleteDomainHostname(domain, null, false);
ResultURLs.deleteDomain(tabletype, domain);

@ -294,7 +294,7 @@ public class Crawler_p {
// Build the site filter from the crawl start URLs and, if requested via deleteold,
// delete the documents of every start host from the fulltext index first.
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
// NOTE(review): the next two lines are the same statement before and after the rename
// deleteDomain -> deleteDomainHashpart (diff rendering without +/- markers).
// The third argument (named "concurrent" in the Fulltext signature) is true only when
// more than one root URL is given.
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
// log only when the call reported a positive count
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());

@ -77,7 +77,6 @@ function updatepage(str) {
<dt class="TableCellDark">Retrieve by URL-Hash:</dt>
<dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
<input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" class="submitready" style="width:240px;"/>
<input type="submit" name="urlhashsimilar" value="Generate List" class="submitready" style="width:240px;"/>
@ -132,7 +131,7 @@ function updatepage(str) {
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" name="hashpart" value="#[hashpart]#" />
<input type="hidden" name="domain" value="#[domain]#" />
<input type="hidden" name="lines" value="#[lines]#" />
<input type="submit" name="deletedomain" value="delete all" class="submitready" style="width:240px;"/>
@ -206,13 +205,6 @@ function updatepage(str) {
<div class="commit">Stored a solr dump to file #[dumpfile]#</div>::
#(urlhashsimilar)#::<p>Sequential List of URL-Hashes:<br />
#{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&amp;urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
::No entry found for URL-hash #[urlhash]#
::<iframe src="/api/yacydoc.html?urlhash=#[urlhash]#" width="100%" height="420" frameborder="0" scrolling="no"></iframe><br />

@ -30,13 +30,15 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.data.WorkTables;
@ -44,7 +46,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.RotateIterator;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
@ -236,30 +237,6 @@ public class IndexControlURLs_p {
// generate list
// Handles the "urlhashsimilar" action: walks the URLs of the fulltext index through a
// RotateIterator and writes their hashes into the urlhashsimilar_rows_*_cols_* template
// properties.
// NOTE(review): this hunk is shown without +/- markers and with several lines missing
// (loop increments, closing braces); per the hunk header most of it is removed code.
if (post.containsKey("urlhashsimilar")) {
// second argument is a string built from Base64Order.zero() with the length of the posted
// urlhash (0 if none); third argument is the RWI count narrowed to int -
// presumably start key and iteration bound of the rotation, TODO confirm in RotateIterator
final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
DigestURI entry;
int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
// the loop condition caps the listing at i < 256; a null entry ends the listing early
while (entryIt.hasNext() && i < 256) {
entry = entryIt.next();
if (entry == null) break;
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
// when the column counter reaches 8 the column count of the row is stored and the counter reset
if (cols==8) {
prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
cols = 0;
prop.put("statistics", 0);
prop.put("urlhashsimilar_rows", rows);
prop.put("result", result.toString());
// Handles the "lurlexport" action (URL export); the format code is parsed from the request below
if (post.containsKey("lurlexport")) {
// parse format
int format = 0;
@ -279,7 +256,7 @@ public class IndexControlURLs_p {
// Start the export into file s, restricted by the posted "exportfilter" expression
// (defaults to ".*" when the parameter is absent).
final File f = new File(s);
final String filter = post.get("exportfilter", ".*");
// NOTE(review): the two declarations of "running" are the old call (with an extra null
// argument for the HandleSet parameter) and the new call of Fulltext.export(); only one
// exists in either file version (diff rendering without +/- markers).
final Fulltext.Export running = segment.fulltext().export(f, filter, null, format, dom);
final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);
// report the target file and the export's current counter value to the template
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());
@ -301,29 +278,29 @@ public class IndexControlURLs_p {
// Handles the "deletedomain" action of the statistics table and then (re)builds the
// per-host statistics table.
// NOTE(review): diff rendering without +/- markers - statements working with
// Fulltext.HostStat / "hashpart" and statements working with Solr facets / host names
// are presumably the old and the new variant of the same code.
if (post.containsKey("deletedomain")) {
final String hp = post.get("hashpart");
segment.fulltext().deleteDomain(hp, null, false);
final String domain = post.get("domain");
segment.fulltext().deleteDomainHostname(domain, null, false);
// trigger the loading of the table
post.put("statistics", "");
if (post.containsKey("statistics")) {
// number of table lines requested by the client, 100 if not given
final int count = post.getInt("lines", 100);
Iterator<Fulltext.HostStat> statsiter;
prop.put("statistics_lines", count);
int cnt = 0;
try {
final Fulltext metadata = segment.fulltext();
statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
// facet query on the host field over documents matching httpstatus 200;
// count is passed as the second argument - presumably the maximum number of facet values
Map<String, ReversibleScoreMap<String>> scores = metadata.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", count, YaCySchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
// NOTE(review): keys(false) presumably iterates in descending score order - confirm in ReversibleScoreMap
Iterator<String> statsiter = stats.keys(false);
boolean dark = true;
Fulltext.HostStat hs;
String hostname;
// NOTE(review): this key has no '_' between the index and "lines", unlike the sibling
// keys ("_dark", "_domain", "_count") - confirm this is what the template expects
prop.put("statisticslines_domains_" + cnt + "lines", count);
while (statsiter.hasNext() && cnt < count) {
hs = statsiter.next();
hostname = statsiter.next();
// "dark" alternates per row (see the toggle at the end of the loop body)
prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
// HostStat variant: the port is appended to the host name unless it is 80
prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
prop.put("statisticslines_domains_" + cnt + "lines", count);
prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
// facet variant: host name and its facet score
prop.put("statisticslines_domains_" + cnt + "_domain", hostname);
prop.put("statisticslines_domains_" + cnt + "_count", stats.get(hostname));
dark = !dark;

@ -13,13 +13,4 @@

@ -1,70 +0,0 @@
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.peers.graphics.WebStructureGraph.HostReference;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Fulltext.HostStat;
import net.yacy.search.index.Segment;
import net.yacy.search.ranking.BlockRank;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.servletProperties;
// Servlet class that recomputes the BlockRank ("YBR") ranking tables from the host index.
// NOTE(review): per the hunk header (-1,70 +0,0) this whole file is deleted by the commit;
// closing braces and some lines are missing in this rendering.
public class YBRFetch_p
// Servlet entry point. Returns an empty property set in every visible path; the work is
// done through side effects on BlockRank and on files below sb.queuesRoot / sb.appPath.
public static servletProperties respond(
@SuppressWarnings("unused") final RequestHeader requestHeader,
final serverObjects post,
final serverSwitch env) {
final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard) env;
// do nothing unless the request carries the "ghrt4" key and MemoryControl reports
// at least 1024*1024*1024 (presumably bytes, i.e. 1 GiB) as available
if ( post == null || !post.containsKey("ghrt4") || MemoryControl.available() < 1024L * 1024L * 1024L ) {
return prop;
// the host index is cached in a blob file: collected and saved on first use, loaded afterwards
final File hostIndexFile = new File(sb.queuesRoot, "hostIndex.blob");
ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now
if ( !hostIndexFile.exists() ) {
hostIndex = BlockRank.collect(sb.peers, sb.webStructure, Integer.MAX_VALUE);
BlockRank.saveHostIndex(hostIndex, hostIndexFile);
} else {
hostIndex = BlockRank.loadHostIndex(hostIndexFile);
// use an index segment to find hosts for given host hashes
final Segment segment = sb.index;
final Fulltext metadata = segment.fulltext();
Map<String, HostStat> hostHashResolver;
try {
hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector());
} catch ( final IOException e ) {
// fall back to an empty resolver; the exception itself is not logged here
hostHashResolver = new HashMap<String, HostStat>();
// recursively compute a new ranking table
Log.logInfo("BLOCK RANK", "computing new ranking tables...");
BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0);
hostIndex = null; // we don't need that here any more, so free the memory
// use the web structure and the hostHash resolver to analyse the ranking table
Log.logInfo("BLOCK RANK", "analysis of " + BlockRank.ybrTables.length + " tables...");
BlockRank.analyse(sb.webStructure, hostHashResolver);
// store the new table
Log.logInfo("BLOCK RANK", "storing fresh table...");
// "ranking/YBR" below the application path, using the platform's separator character
final File rankingPath = new File(sb.appPath, "ranking/YBR".replace('/', File.separatorChar));
// NOTE(review): the log message says "storing" but the visible call is loadBlockRankTable;
// a store call may be missing from this rendering - confirm
BlockRank.loadBlockRankTable(rankingPath, 16);
return prop;

@ -143,17 +143,8 @@ public final class ResultURLs {
return getDomains(stack).keys(false);
// Removes a host from the domain statistics of the given result stack and returns the
// value of getDomains(stack).delete(host).
// NOTE(review): diff rendering without +/- markers - the three-argument signature and
// the statements that use hosthash/urlhash are presumably the removed variant, the
// two-argument signature the new one.
public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
public static int deleteDomain(final EventOrigin stack, final String host) {
assert host != null : "host = null";
assert hosthash.length() == 6;
// host-hash variant: walk all result entries of the stack and remove those whose
// key is null or whose key, from index 6 onwards, equals the host hash
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
Map.Entry<String, InitExecEntry> w;
String urlhash;
while (i.hasNext()) {
w = i.next();
urlhash = w.getKey();
if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).delete(host);

@ -34,9 +34,9 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
@ -49,8 +49,8 @@ import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.storage.ZIPReader;
import net.yacy.cora.storage.ZIPWriter;
import net.yacy.document.parser.html.CharacterCoding;
@ -64,15 +64,15 @@ import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.MergeIterator;
import net.yacy.search.Switchboard;
import org.apache.commons.httpclient.util.DateUtil;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
public final class Fulltext implements Iterable<byte[]> {
public final class Fulltext {
private static final String SOLR_PATH = "solr_40"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml
private static final String SOLR_OLD_PATH[] = new String[]{"solr_36"};
@ -359,7 +359,7 @@ public final class Fulltext implements Iterable<byte[]> {
* @return number of deleted domains
* @throws IOException
public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) {
public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
@ -412,6 +412,38 @@ public final class Fulltext implements Iterable<byte[]> {
return count.get();
// Deletes the documents of one host, selected by host name, from the Solr index.
// hostname   - host name matched (quoted) against the host_s field
// freshdate  - if not null and in the past, the query is additionally restricted to
//              documents whose load_date_dt is in the range [* TO freshdate]
// concurrent - true: the work runs in a newly started thread; false: it runs
//              synchronously on the calling thread
// Returns the value of the counter at return time.
// NOTE(review): with concurrent == true the method returns right after t.start(), so the
// returned count may not reflect the finished deletion - confirm callers do not rely on it.
// NOTE(review): the statement that actually deletes and updates "count" is not visible in
// this rendering (lines and closing braces are missing).
public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
// NOTE(review): hostname is concatenated into the query unescaped; a value containing
// a double quote would break out of the quoted term
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
// AtomicInteger so that the anonymous Thread below can update the counter
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
// commit only if something was counted as deleted
if (count.get() > 0) Fulltext.this.solr.commit(true);
// NOTE(review): the IOException is swallowed silently
} catch (IOException e) {}
// finally remove the line with statistics
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
while (hsi.hasNext()) {
hs = hsi.next();
if (hs.hostname.equals(hostname)) {
// run asynchronously or inline, depending on the caller's choice
if (concurrent) t.start(); else t.run();
return count.get();
* remove a full subpath from the index
* @param subpath the left path of the url; at least until the end of the host
@ -510,96 +542,6 @@ public final class Fulltext implements Iterable<byte[]> {
if (reason == null) return null;
return reason == null ? null : reason.length() == 0 ? null : reason;
// Iterates over the url hashes (as byte arrays) of the index: the keys of urlIndexFile
// (if that file exists) combined with the ids delivered by the Solr connector.
// NOTE(review): per the hunk header this method is removed by the commit; closing braces
// and the tail of the MergeIterator expression are missing in this rendering.
public Iterator<byte[]> iterator() {
CloneableIterator<byte[]> a = null;
// an IOException while opening the key iterator is swallowed; a then stays null
if (this.urlIndexFile != null) try {a = this.urlIndexFile.keys(true, null);} catch (IOException e) {}
final Iterator<String> idi = this.solr.iterator();
// adapter that converts the String ids from Solr to byte arrays
CloneableIterator<byte[]> b = new CloneableIterator<byte[]>() {
public boolean hasNext() {
return idi.hasNext();
public byte[] next() {
String s = idi.next();
return s == null ? null : ASCII.getBytes(s);
public void remove() {
throw new UnsupportedOperationException();
// clone() returns this same instance, not an independent copy
public CloneableIterator<byte[]> clone(Object modifier) {
return this;
public void close() {
// without a urlIndexFile iterator only the Solr ids are delivered
if (a == null) return b;
return new MergeIterator<byte[]>(a, b,
// Iterates over all entries of iterator() and resolves each id to a DigestURI via getURL().
// next() returns null when the underlying id is null.
// NOTE(review): per the hunk header this method is removed by the commit; closing braces
// are missing in this rendering.
public CloneableIterator<DigestURI> urls() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<DigestURI>() {
// clone() returns this same instance, not an independent copy
public CloneableIterator<DigestURI> clone(final Object secondHash) {
return this;
public final boolean hasNext() {
return ids.hasNext();
public final DigestURI next() {
byte[] id = ids.next();
if (id == null) return null;
return getURL(id);
public final void remove() {
public void close() {
// Iterates over all entries of iterator() and resolves each id to a URIMetadataNode via
// getMetadata(). next() returns null when the underlying id is null.
// NOTE(review): per the hunk header this method is removed by the commit; closing braces
// are missing in this rendering.
public CloneableIterator<URIMetadataNode> entries() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<URIMetadataNode>() {
// clone() returns this same instance, not an independent copy
public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
return this;
public final boolean hasNext() {
return ids.hasNext();
public final URIMetadataNode next() {
byte[] id = ids.next();
if (id == null) return null;
return getMetadata(id);
public final void remove() {
public void close() {
public List<File> dumpFiles() {
EmbeddedSolrConnector esc = (EmbeddedSolrConnector) this.solr.getSolr0();
@ -675,12 +617,12 @@ public final class Fulltext implements Iterable<byte[]> {
// export methods
// Creates an Export thread object for the given target file, filter expression, format
// code and domain-only flag, remembers it in this.exportthread and returns it.
// If a previously created export thread is still alive, no new one is created; a warning
// is logged and the running one is returned instead.
// NOTE(review): diff rendering without +/- markers - the signature/constructor call with
// the HandleSet parameter is presumably the removed variant. A start() call on the new
// thread is not visible in this rendering.
public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) {
public Export export(final File f, final String filter, final int format, final boolean dom) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
Log.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread;
this.exportthread = new Export(f, filter, set, format, dom);
this.exportthread = new Export(f, filter, format, dom);
return this.exportthread;
@ -691,22 +633,20 @@ public final class Fulltext implements Iterable<byte[]> {
public class Export extends Thread {
private final File f;
private final String filter;
private final Pattern pattern;
private int count;
private String failure;
private final int format;
private final boolean dom;
private final HandleSet set;
// Constructor: stores the export parameters and resets counter and failure state.
// NOTE(review): diff rendering without +/- markers - the signature with the HandleSet
// parameter and the "this.set = set" assignment are presumably the removed variant.
private Export(final File f, final String filter, final HandleSet set, final int format, boolean dom) {
private Export(final File f, final String filter, final int format, boolean dom) {
// format: 0=text, 1=html, 2=rss/xml
this.f = f;
this.filter = filter;
// compile the filter once; a null filter yields a null pattern
this.pattern = filter == null ? null : Pattern.compile(filter);
this.count = 0;
this.failure = null;
this.format = format;
this.dom = dom;
this.set = set;
// NOTE(review): in the order shown, this reassigns only the parameter after this.dom has
// already been set, so the field is unaffected - confirm whether the check was meant to
// come before the assignment
if ((dom) && (format == 2)) dom = false;
@ -724,43 +664,54 @@ public final class Fulltext implements Iterable<byte[]> {
// Body of the export: writes either the list of hosts (this.dom) or one record per
// document to the writer pw, in the format selected by this.format.
// NOTE(review): diff rendering without +/- markers - statements that use "entry",
// entries(), this.set or domainNameCollector() are presumably the removed variant, the
// statements that use Solr facets / SolrDocument the new one. Several lines (format 0
// output, counters, closing braces) are missing in this rendering.
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
pw.println("<title>YaCy Peer-to-Peer - Web-Search LURL Export</title>");
pw.println("<title>YaCy Peer-to-Peer - Web-Search URL Export</title>");
if (this.dom) {
// domain export: one line per host name
final TreeSet<String> set = domainNameCollector(-1, domainSampleCollector());
for (final String host: set) {
if (!host.matches(this.filter)) continue;
// facet variant: host names come from a facet query on host_s over documents with
// httpstatus 200; 100000 is passed as the second argument (presumably the facet limit)
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 100000, YaCySchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
for (final String host: stats) {
// hosts not matching the pre-compiled filter pattern are skipped; a null pattern matches all
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
if (this.format == 0) pw.println(host);
if (this.format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
} else {
// document export: one record per indexed document
final Iterator<URIMetadataNode> i = entries(); // iterates indexURLEntry objects
URIMetadataNode entry;
String url;
while (i.hasNext()) {
entry = i.next();
if (this.set != null && !this.set.has(entry.hash())) continue;
url = entry.url().toNormalform(true);
if (!url.matches(this.filter)) continue;
// Solr variant: query documents with httpstatus 200 and request only the fields used
// below. NOTE(review): the numeric arguments are presumably offset, maximum count,
// timeout in ms (10 hours) and queue/page size - confirm against concurrentQuery()
BlockingQueue<SolrDocument> docs = Fulltext.this.getSolr().concurrentQuery(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName(), YaCySchema.title.getSolrFieldName(),
YaCySchema.author.getSolrFieldName(), YaCySchema.description.getSolrFieldName(), YaCySchema.size_i.getSolrFieldName(), YaCySchema.last_modified.getSolrFieldName());
SolrDocument doc;
ArrayList<?> title;
String url, author, description, hash;
Integer size;
Date date;
// take() blocks until a document is available; the loop ends when the poison marker is taken
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// NOTE(review): the casts below assume the stored field types (title as a list, the
// others single-valued); a differently typed field would raise a ClassCastException
hash = (String) doc.getFieldValue(YaCySchema.id.getSolrFieldName());
url = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
title = (ArrayList<?>) doc.getFieldValue(YaCySchema.title.getSolrFieldName());
author = (String) doc.getFieldValue(YaCySchema.author.getSolrFieldName());
description = (String) doc.getFieldValue(YaCySchema.description.getSolrFieldName());
size = (Integer) doc.getFieldValue(YaCySchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(YaCySchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.format == 0) {
if (this.format == 1) {
pw.println("<a href=\"" + url + "\">" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</a><br>");
// NOTE(review): only the first title value is used; title.iterator().next() throws
// NoSuchElementException for an empty list. Documents without a title produce no
// HTML line at all in this variant.
if (title != null) pw.println("<a href=\"" + MultiProtocolURI.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
if (this.format == 2) {
pw.println("<title>" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</title>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
if (!entry.dc_creator().isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(entry.dc_creator(), true) + "</author>");
if (!entry.dc_subject().isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(entry.dc_subject(), true) + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<yacy:size>" + entry.size() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + ASCII.String(entry.hash()) + "</guid>");
// Solr variant: optional elements are written only when the field is present (and non-empty)
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
if (date != null) pw.println("<pubDate>" + DateUtil.formatDate(date) + "</pubDate>");
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
@ -798,60 +749,6 @@ public final class Fulltext implements Iterable<byte[]> {
* collect domain samples: all url hashes from the metadata database is listed and the domain part
* of the url hashes is used to count how many of these domain hashes appear
* @return a map from domain hashes to hash statistics
* @throws IOException
// Builds a map from host hash to a URLHashCounter by iterating over all url hashes of
// this index. The first url hash seen for a host creates the counter entry.
// NOTE(review): per the hunk header this method is removed by the commit; the body of the
// else branch (presumably incrementing the existing counter) and the closing braces are
// missing in this rendering.
public Map<String, URLHashCounter> domainSampleCollector() throws IOException {
final Map<String, URLHashCounter> map = new HashMap<String, URLHashCounter>();
// first collect all domains and calculate statistics about it
// the whole iteration runs while holding the monitor of this object
synchronized (this) {
final Iterator<byte[]> i = this.iterator();
String hosthash;
byte[] urlhashb;
URLHashCounter ds;
if (i != null) while (i.hasNext()) {
urlhashb = i.next();
// host hash taken from the url hash with arguments (6, 6) - presumably offset 6, length 6
hosthash = ASCII.String(urlhashb, 6, 6);
ds = map.get(hosthash);
if (ds == null) {
ds = new URLHashCounter(urlhashb);
map.put(hosthash, ds);
} else {
return map;
* create a list of domain names in this database
* @param count number of entries or -1 for all
* @param domainSamples a map from domain hashes to hash statistics
* @return a set of domain names, ordered by name of the domains
// Resolves the sample url hash of every domain to its URL and returns a TreeSet of names.
// As a side effect this.statsDump is replaced by a new, empty list.
// NOTE(review): per the hunk header this method is removed by the commit; the statements
// that add to the set / statsDump and decrement count are missing in this rendering.
private TreeSet<String> domainNameCollector(int count, final Map<String, URLHashCounter> domainSamples) {
// collect hashes from all domains
// fetch urls from the database to determine the host in clear text
DigestURI url;
// a negative or too large count means: all domains
if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
this.statsDump = new ArrayList<HostStat>();
final TreeSet<String> set = new TreeSet<String>();
for (final URLHashCounter hs: domainSamples.values()) {
if (hs == null) continue;
url = this.getURL(hs.urlhashb);
// samples whose URL or host cannot be resolved are skipped
if (url == null || url.getHost() == null) continue;
if (count == 0) break;
return set;
* calculate a score map for url hash samples: each sample is a single url hash
* that stands for all entries for the corresponding domain. The map counts the number

@ -246,7 +246,8 @@ public class QueryGoal {
// add a filter so that results do not come from failed URLs
// NOTE(review): diff rendering without +/- markers - the active failreason_t exclusion is
// presumably the old filter, replaced by the httpstatus_i:200 restriction; the old filter
// is then kept only as the commented-out line below.
q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
q.append(" AND ").append(YaCySchema.httpstatus_i.getSolrFieldName()).append(":200");
//q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
return q;
