Migrated the index export methods from the old metadata store to Solr. Exports
are now done using Solr queries. Removed superfluous methods and servlets.
pull/1/head
Michael Peter Christen 12 years ago
parent 1768c82010
commit 0fe7b6fd3b

@ -124,10 +124,9 @@ public class CrawlResults {
if (post.containsKey("deletedomain")) {
final String domain = post.get("domain", null);
final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
if (hashpart != null) {
sb.index.fulltext().deleteDomain(hashpart, null, false);
ResultURLs.deleteDomain(tabletype, domain, hashpart);
if (domain != null) {
sb.index.fulltext().deleteDomainHostname(domain, null, false);
ResultURLs.deleteDomain(tabletype, domain);
}
}

@ -294,7 +294,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}

@ -77,7 +77,6 @@ function updatepage(str) {
<dt class="TableCellDark">Retrieve by URL-Hash:</dt>
<dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
<input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" class="submitready" style="width:240px;"/>
<input type="submit" name="urlhashsimilar" value="Generate List" class="submitready" style="width:240px;"/>
</dd>
</dl>
</fieldset>
@ -132,7 +131,7 @@ function updatepage(str) {
<td>
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="hashpart" value="#[hashpart]#" />
<input type="hidden" name="domain" value="#[domain]#" />
<input type="hidden" name="lines" value="#[lines]#" />
<input type="submit" name="deletedomain" value="delete all" class="submitready" style="width:240px;"/>
</div>
@ -206,13 +205,6 @@ function updatepage(str) {
<div class="commit">Stored a solr dump to file #[dumpfile]#</div>::
#(/indexdump)#
#(urlhashsimilar)#::<p>Sequential List of URL-Hashes:<br />
#{rows}#
#{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&amp;urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
#{/rows}#
</p>
#(/urlhashsimilar)#
#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::<iframe src="/api/yacydoc.html?urlhash=#[urlhash]#" width="100%" height="420" frameborder="0" scrolling="no"></iframe><br />

@ -30,13 +30,15 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.data.WorkTables;
@ -44,7 +46,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.RotateIterator;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
@ -236,30 +237,6 @@ public class IndexControlURLs_p {
}
}
// generate list
if (post.containsKey("urlhashsimilar")) {
final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
DigestURI entry;
int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) {
entry = entryIt.next();
if (entry == null) break;
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
cols++;
if (cols==8) {
prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
cols = 0;
rows++;
}
i++;
}
prop.put("statistics", 0);
prop.put("urlhashsimilar_rows", rows);
prop.put("result", result.toString());
}
if (post.containsKey("lurlexport")) {
// parse format
int format = 0;
@ -279,7 +256,7 @@ public class IndexControlURLs_p {
final File f = new File(s);
f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*");
final Fulltext.Export running = segment.fulltext().export(f, filter, null, format, dom);
final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());
@ -301,29 +278,29 @@ public class IndexControlURLs_p {
}
if (post.containsKey("deletedomain")) {
final String hp = post.get("hashpart");
segment.fulltext().deleteDomain(hp, null, false);
final String domain = post.get("domain");
segment.fulltext().deleteDomainHostname(domain, null, false);
// trigger the loading of the table
post.put("statistics", "");
}
if (post.containsKey("statistics")) {
final int count = post.getInt("lines", 100);
Iterator<Fulltext.HostStat> statsiter;
prop.put("statistics_lines", count);
int cnt = 0;
try {
final Fulltext metadata = segment.fulltext();
statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
Map<String, ReversibleScoreMap<String>> scores = metadata.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", count, YaCySchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
Iterator<String> statsiter = stats.keys(false);
boolean dark = true;
Fulltext.HostStat hs;
String hostname;
prop.put("statisticslines_domains_" + cnt + "lines", count);
while (statsiter.hasNext() && cnt < count) {
hs = statsiter.next();
hostname = statsiter.next();
prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
prop.put("statisticslines_domains_" + cnt + "lines", count);
prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
prop.put("statisticslines_domains_" + cnt + "_domain", hostname);
prop.put("statisticslines_domains_" + cnt + "_count", stats.get(hostname));
dark = !dark;
cnt++;
}

@ -13,13 +13,4 @@
#(indexdump)#::
<dumpfile>#[dumpfile]#</dumpfile>::
#(/indexdump)#
#(urlhashsimilar)#::
<urls>
#{rows}#
#{cols}#
<urlhash>#[urlHash]#</urlhash>
#{/cols}#
#{/rows}#
</urls>
#(/urlhashsimilar)#
</data>

@ -1,70 +0,0 @@
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.peers.graphics.WebStructureGraph.HostReference;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Fulltext.HostStat;
import net.yacy.search.index.Segment;
import net.yacy.search.ranking.BlockRank;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.servletProperties;
public class YBRFetch_p
{
public static servletProperties respond(
@SuppressWarnings("unused") final RequestHeader requestHeader,
final serverObjects post,
final serverSwitch env) {
final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard) env;
if ( post == null || !post.containsKey("ghrt4") || MemoryControl.available() < 1024L * 1024L * 1024L ) {
return prop;
}
final File hostIndexFile = new File(sb.queuesRoot, "hostIndex.blob");
ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now
if ( !hostIndexFile.exists() ) {
hostIndex = BlockRank.collect(sb.peers, sb.webStructure, Integer.MAX_VALUE);
BlockRank.saveHostIndex(hostIndex, hostIndexFile);
} else {
hostIndex = BlockRank.loadHostIndex(hostIndexFile);
}
// use an index segment to find hosts for given host hashes
final Segment segment = sb.index;
final Fulltext metadata = segment.fulltext();
Map<String, HostStat> hostHashResolver;
try {
hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector());
} catch ( final IOException e ) {
hostHashResolver = new HashMap<String, HostStat>();
}
// recursively compute a new ranking table
Log.logInfo("BLOCK RANK", "computing new ranking tables...");
BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0);
hostIndex = null; // we don't need that here any more, so free the memory
// use the web structure and the hostHash resolver to analyse the ranking table
Log.logInfo("BLOCK RANK", "analysis of " + BlockRank.ybrTables.length + " tables...");
BlockRank.analyse(sb.webStructure, hostHashResolver);
// store the new table
Log.logInfo("BLOCK RANK", "storing fresh table...");
final File rankingPath = new File(sb.appPath, "ranking/YBR".replace('/', File.separatorChar));
BlockRank.storeBlockRankTable(rankingPath);
BlockRank.loadBlockRankTable(rankingPath, 16);
return prop;
}
}

@ -143,17 +143,8 @@ public final class ResultURLs {
return getDomains(stack).keys(false);
}
public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
public static int deleteDomain(final EventOrigin stack, final String host) {
assert host != null : "host = null";
assert hosthash.length() == 6;
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
Map.Entry<String, InitExecEntry> w;
String urlhash;
while (i.hasNext()) {
w = i.next();
urlhash = w.getKey();
if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
}
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).delete(host);
}

@ -34,9 +34,9 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
@ -49,8 +49,8 @@ import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.storage.ZIPReader;
import net.yacy.cora.storage.ZIPWriter;
import net.yacy.document.parser.html.CharacterCoding;
@ -64,15 +64,15 @@ import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.MergeIterator;
import net.yacy.search.Switchboard;
import org.apache.commons.httpclient.util.DateUtil;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
public final class Fulltext implements Iterable<byte[]> {
public final class Fulltext {
private static final String SOLR_PATH = "solr_40"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml
private static final String SOLR_OLD_PATH[] = new String[]{"solr_36"};
@ -359,7 +359,7 @@ public final class Fulltext implements Iterable<byte[]> {
* @return number of deleted domains
* @throws IOException
*/
public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) {
public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
@ -412,6 +412,38 @@ public final class Fulltext implements Iterable<byte[]> {
return count.get();
}
public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
if (count.get() > 0) Fulltext.this.solr.commit(true);
} catch (IOException e) {}
}
// finally remove the line with statistics
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
while (hsi.hasNext()) {
hs = hsi.next();
if (hs.hostname.equals(hostname)) {
hsi.remove();
break;
}
}
}
}
};
if (concurrent) t.start(); else t.run();
return count.get();
}
/**
* remove a full subpath from the index
* @param subpath the left path of the url; at least until the end of the host
@ -510,96 +542,6 @@ public final class Fulltext implements Iterable<byte[]> {
if (reason == null) return null;
return reason == null ? null : reason.length() == 0 ? null : reason;
}
@Override
public Iterator<byte[]> iterator() {
CloneableIterator<byte[]> a = null;
if (this.urlIndexFile != null) try {a = this.urlIndexFile.keys(true, null);} catch (IOException e) {}
final Iterator<String> idi = this.solr.iterator();
CloneableIterator<byte[]> b = new CloneableIterator<byte[]>() {
@Override
public boolean hasNext() {
return idi.hasNext();
}
@Override
public byte[] next() {
String s = idi.next();
return s == null ? null : ASCII.getBytes(s);
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@Override
public CloneableIterator<byte[]> clone(Object modifier) {
return this;
}
@Override
public void close() {
}
};
if (a == null) return b;
return new MergeIterator<byte[]>(a, b,
URIMetadataRow.rowdef.objectOrder,
MergeIterator.simpleMerge,
true);
}
public CloneableIterator<DigestURI> urls() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<DigestURI>() {
@Override
public CloneableIterator<DigestURI> clone(final Object secondHash) {
return this;
}
@Override
public final boolean hasNext() {
return ids.hasNext();
}
@Override
public final DigestURI next() {
byte[] id = ids.next();
if (id == null) return null;
return getURL(id);
}
@Override
public final void remove() {
ids.remove();
}
@Override
public void close() {
}
};
}
public CloneableIterator<URIMetadataNode> entries() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<URIMetadataNode>() {
@Override
public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
return this;
}
@Override
public final boolean hasNext() {
return ids.hasNext();
}
@Override
public final URIMetadataNode next() {
byte[] id = ids.next();
if (id == null) return null;
return getMetadata(id);
}
@Override
public final void remove() {
ids.remove();
}
@Override
public void close() {
}
};
}
public List<File> dumpFiles() {
EmbeddedSolrConnector esc = (EmbeddedSolrConnector) this.solr.getSolr0();
@ -675,12 +617,12 @@ public final class Fulltext implements Iterable<byte[]> {
}
// export methods
public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) {
public Export export(final File f, final String filter, final int format, final boolean dom) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
Log.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread;
}
this.exportthread = new Export(f, filter, set, format, dom);
this.exportthread = new Export(f, filter, format, dom);
this.exportthread.start();
return this.exportthread;
}
@ -691,22 +633,20 @@ public final class Fulltext implements Iterable<byte[]> {
public class Export extends Thread {
private final File f;
private final String filter;
private final Pattern pattern;
private int count;
private String failure;
private final int format;
private final boolean dom;
private final HandleSet set;
private Export(final File f, final String filter, final HandleSet set, final int format, boolean dom) {
private Export(final File f, final String filter, final int format, boolean dom) {
// format: 0=text, 1=html, 2=rss/xml
this.f = f;
this.filter = filter;
this.pattern = filter == null ? null : Pattern.compile(filter);
this.count = 0;
this.failure = null;
this.format = format;
this.dom = dom;
this.set = set;
if ((dom) && (format == 2)) dom = false;
}
@ -724,43 +664,54 @@ public final class Fulltext implements Iterable<byte[]> {
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
pw.println("<channel>");
pw.println("<title>YaCy Peer-to-Peer - Web-Search LURL Export</title>");
pw.println("<title>YaCy Peer-to-Peer - Web-Search URL Export</title>");
pw.println("<description></description>");
pw.println("<link>http://yacy.net</link>");
}
if (this.dom) {
final TreeSet<String> set = domainNameCollector(-1, domainSampleCollector());
for (final String host: set) {
if (!host.matches(this.filter)) continue;
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 100000, YaCySchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
for (final String host: stats) {
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
if (this.format == 0) pw.println(host);
if (this.format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
this.count++;
}
} else {
final Iterator<URIMetadataNode> i = entries(); // iterates indexURLEntry objects
URIMetadataNode entry;
String url;
while (i.hasNext()) {
entry = i.next();
if (this.set != null && !this.set.has(entry.hash())) continue;
url = entry.url().toNormalform(true);
if (!url.matches(this.filter)) continue;
BlockingQueue<SolrDocument> docs = Fulltext.this.getSolr().concurrentQuery(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName(), YaCySchema.title.getSolrFieldName(),
YaCySchema.author.getSolrFieldName(), YaCySchema.description.getSolrFieldName(), YaCySchema.size_i.getSolrFieldName(), YaCySchema.last_modified.getSolrFieldName());
SolrDocument doc;
ArrayList<?> title;
String url, author, description, hash;
Integer size;
Date date;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
hash = (String) doc.getFieldValue(YaCySchema.id.getSolrFieldName());
url = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
title = (ArrayList<?>) doc.getFieldValue(YaCySchema.title.getSolrFieldName());
author = (String) doc.getFieldValue(YaCySchema.author.getSolrFieldName());
description = (String) doc.getFieldValue(YaCySchema.description.getSolrFieldName());
size = (Integer) doc.getFieldValue(YaCySchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(YaCySchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.format == 0) {
pw.println(url);
}
if (this.format == 1) {
pw.println("<a href=\"" + url + "\">" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</a><br>");
if (title != null) pw.println("<a href=\"" + MultiProtocolURI.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
}
if (this.format == 2) {
pw.println("<item>");
pw.println("<title>" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</title>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
if (!entry.dc_creator().isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(entry.dc_creator(), true) + "</author>");
if (!entry.dc_subject().isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(entry.dc_subject(), true) + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
pw.println("<yacy:size>" + entry.size() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + ASCII.String(entry.hash()) + "</guid>");
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
if (date != null) pw.println("<pubDate>" + DateUtil.formatDate(date) + "</pubDate>");
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
pw.println("</item>");
}
this.count++;
@ -798,60 +749,6 @@ public final class Fulltext implements Iterable<byte[]> {
}
/**
* collect domain samples: all url hashes from the metadata database is listed and the domain part
* of the url hashes is used to count how many of these domain hashes appear
* @return a map from domain hashes to hash statistics
* @throws IOException
*/
public Map<String, URLHashCounter> domainSampleCollector() throws IOException {
final Map<String, URLHashCounter> map = new HashMap<String, URLHashCounter>();
// first collect all domains and calculate statistics about it
synchronized (this) {
final Iterator<byte[]> i = this.iterator();
String hosthash;
byte[] urlhashb;
URLHashCounter ds;
if (i != null) while (i.hasNext()) {
urlhashb = i.next();
hosthash = ASCII.String(urlhashb, 6, 6);
ds = map.get(hosthash);
if (ds == null) {
ds = new URLHashCounter(urlhashb);
map.put(hosthash, ds);
} else {
ds.count++;
}
}
}
return map;
}
/**
* create a list of domain names in this database
* @param count number of entries or -1 for all
* @param domainSamples a map from domain hashes to hash statistics
* @return a set of domain names, ordered by name of the domains
*/
private TreeSet<String> domainNameCollector(int count, final Map<String, URLHashCounter> domainSamples) {
// collect hashes from all domains
// fetch urls from the database to determine the host in clear text
DigestURI url;
if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
this.statsDump = new ArrayList<HostStat>();
final TreeSet<String> set = new TreeSet<String>();
for (final URLHashCounter hs: domainSamples.values()) {
if (hs == null) continue;
url = this.getURL(hs.urlhashb);
if (url == null || url.getHost() == null) continue;
set.add(url.getHost());
count--;
if (count == 0) break;
}
return set;
}
/**
* calculate a score map for url hash samples: each sample is a single url hash
* that stands for all entries for the corresponding domain. The map counts the number

@ -246,7 +246,8 @@ public class QueryGoal {
q.append(')');
// add filter to prevent that results come from failed urls
q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
q.append(" AND ").append(YaCySchema.httpstatus_i.getSolrFieldName()).append(":200");
//q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
return q;
}

Loading…
Cancel
Save