yacy_search_server/source/net/yacy/htroot/IndexBrowser_p.java


/**
* IndexBrowser
* Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 27.09.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.htroot;
import static net.yacy.repository.BlacklistHelper.addBlacklistEntry;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
* Browser for indexed resources
*/
public class IndexBrowser_p {
final static long TIMEOUT = 10000L; // 10 seconds; time budget for the Solr query task and the result collection loops below
public static enum StoreType {
LINK,     // only linked from another document, not (yet) stored in the index
INDEX,    // stored in the local index
EXCLUDED, // excluded from indexing
FAILED,   // loading failed
RELOAD;   // reload candidate (not used in this servlet)
}
/**
* <p>Retrieve local index entries for a path, or for the hosts with the most references. Also allow some maintenance operations on entries with load errors.</p>
* <p>Some parameters require administrator authentication or an unauthenticated localhost request to be allowed: load, deleteLoadErrors, delete, reload404,
* hosts="crawling" and hosts="error".
* The "load" parameter can also be applied without authentication when the "browser.load4everyone" configuration setting is true.</p>
* <p>
* Configuration settings:
* <ul>
* <li>browser.autoload : allow the administrator to stack URLs to the local crawl queue, manually with the "load" parameter,
* or automatically when the "path" parameter is filled with an unknown URL</li>
* <li>browser.load4everyone : allow everyone to stack URLs to the local crawl queue.
* "browser.autoload" has also to be set to true to enable automatic loading on an unknown path</li>
* <li>publicSearchpage : set to false to restrict use of this servlet to authenticated administrator only</li>
* <li>decoration.hostanalysis : add supplementary hosts information for debug/analysis purpose</li>
* <li>decoration.grafics.linkstructure : display a link structure graph when the path parameter is filled</li>
* </ul>
* </p>
* @param header servlet request header
* @param post request parameters. Supported keys:<ul>
* <li>path : root URL or host name to browse (ignored when the hosts parameter is filled). When not yet locally indexed, this URL can be automatically crawled and indexed
* when "browser.autoload" or "browser.load4everyone" configuration settings are set to true.</li>
* <li>load : URL to crawl and index.</li>
* <li>deleteLoadErrors : delete from the local index documents with load error (HTTP status different from 200 or any other failure).</li>
* <li>hosts : generate the list of hosts with the most references. Supported values:
* <ul>
* <li>"crawling" : restrict to hosts currently being crawled</li>
* <li>"error" : restrict to hosts having at least one resource load error</li>
* </ul>
* </li>
* <li>delete : delete from the index whole documents tree matching the path prefix</li>
* <li>reload404 : reload documents matching the path prefix and which previously failed to load due to a network error</li>
* <li>facetcount : </li>
* <li>complete : we want only root paths for complete lists</li>
* <li>nepr :</li>
* <li>showlinkstructure : when present, display a link graph for path</li>
* </ul>
* @param env server environment
* @return the servlet answer object
*/
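// Illustrative requests (hypothetical host and values, derived from the parameter
// documentation above): browse a host with /IndexBrowser_p.html?path=example.org ,
// or stack a URL for crawling with /IndexBrowser_p.html?load=http://example.org/page.html
// (the latter requires admin rights or browser.load4everyone=true).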
@SuppressWarnings({ "unchecked" })
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final Fulltext fulltext = sb.index.fulltext();
final boolean autoload = sb.getConfigBool("browser.autoload", true);
final boolean load4everyone = sb.getConfigBool("browser.load4everyone", false);
final boolean loadRight = autoload || load4everyone; // add config later
final serverObjects prop = new serverObjects();
// set default values
prop.put("path", "");
prop.put("result", "");
prop.put("hosts", 0);
prop.put("files", 0);
prop.put("hostanalysis", 0);
String path = post == null ? "" : post.get("path", "").trim();
sb.index.fulltext().commit(true);
if (post == null || env == null) {
prop.putNum("ucount", fulltext.collectionSize());
return prop;
}
int p = path.lastIndexOf('/');
if (p < 0 && path.length() > 0) path = path + "/"; else if (p > 7) path = path.substring(0, p + 1); // the search path shall always end with "/"
if (path.length() > 0 && (
!path.startsWith("http://") &&
!path.startsWith("https://") &&
!path.startsWith("ftp://") &&
!path.startsWith("smb://") &&
!path.startsWith("file://"))) { path = "http://" + path; }
prop.putHTML("path", path);
prop.put("delete", path.length() > 0 ? 1 : 0);
DigestURL pathURI = null;
try {pathURI = new DigestURL(path);} catch (final MalformedURLException e) {}
String load = post.get("load", "");
boolean wait = false;
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) {
// if the URL is not yet in the index and loading is permitted, turn this request into a loading request
load = path;
wait = true;
}
if (load.length() > 0 && loadRight) {
// stack URL
DigestURL url;
if (sb.crawlStacker.size() > 2) wait = false; // do not block waiting for indexing when the crawl stacker is already busy
try {
url = new DigestURL(load);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url, null, load, new Date(),
sb.crawler.defaultProxyProfile.handle(),
0,
sb.crawler.defaultProxyProfile.timezoneOffset()
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) for (int i = 0; i < 30; i++) {
if (sb.index.exists(ASCII.String(url.hash()))) break;
try {Thread.sleep(100);} catch (final InterruptedException e) {}
}
} catch (final MalformedURLException e) {
prop.putHTML("result", "bad url '" + load + "'");
}
}
if (post.containsKey("deleteLoadErrors")) {
try {
fulltext.getDefaultConnector().deleteByQuery("-" + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND "
+ CollectionSchema.httpstatus_i.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); // make sure field exists
ConcurrentLog.info ("IndexBrowser_p:", "delete documents with httpstatus_i <> 200");
fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.fail.name() + "\"" );
ConcurrentLog.info ("IndexBrowser_p:", "delete documents with failtype_s = fail");
fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.excl.name() + "\"" );
ConcurrentLog.info ("IndexBrowser_p:", "delete documents with failtype_s = excl");
prop.putNum("ucount", fulltext.collectionSize());
return prop;
} catch (final IOException ex) {
ConcurrentLog.logException(ex);
}
}
if (post.containsKey("hosts")) {
// generate host list
try {
final boolean onlyCrawling = "crawling".equals(post.get("hosts", ""));
final boolean onlyErrors = "error".equals(post.get("hosts", ""));
final int maxcount = 2 * 3 * 2 * 5 * 7 * 2 * 3; // = 2520, evenly divisible by 2..9, which makes nice matrices for 2, 3, 4, 5, 6, 7, 8, 9 rows/columns
// collect hosts from index
ReversibleScoreMap<String> hostscore = fulltext.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", maxcount, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>(true);
// collect hosts from crawler
final Map<String, Integer[]> crawler = sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots);
final Map<String, Integer> hostNameToPendingCount = new HashMap<>();
for (final Entry<String, Integer[]> crawlerEntry: crawler.entrySet()) {
/* The local stack returns keys composed of "hostname:port" : we now sum pending URLs counts by host name */
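/* e.g. (illustrative): stack keys "example.org:80" and "example.org:443" are both counted under "example.org" */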
final String hostName = Domains.stripToHostName(crawlerEntry.getKey());
Integer pendingCount = hostNameToPendingCount.get(hostName);
if(pendingCount == null) {
pendingCount = 0;
}
pendingCount += crawlerEntry.getValue()[0];
hostNameToPendingCount.put(hostName, pendingCount);
}
// collect the errorurls
final Map<String, ReversibleScoreMap<String>> exclfacets = fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName());
final ReversibleScoreMap<String> exclscore = exclfacets == null ? new ClusteredScoreMap<String>(true) : exclfacets.get(CollectionSchema.host_s.getSolrFieldName());
final Map<String, ReversibleScoreMap<String>> failfacets = fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.fail.name(), maxcount, CollectionSchema.host_s.getSolrFieldName());
final ReversibleScoreMap<String> failscore = failfacets == null ? new ClusteredScoreMap<String>(true) : failfacets.get(CollectionSchema.host_s.getSolrFieldName());
int c = 0;
final Iterator<String> i = hostscore.keys(false);
String host;
while (i.hasNext() && c < maxcount) {
host = i.next();
prop.putHTML("hosts_list_" + c + "_host", host);
final boolean inCrawler = hostNameToPendingCount.containsKey(host);
final int exclcount = exclscore.get(host);
final int failcount = failscore.get(host);
final int errors = exclcount + failcount;
prop.put("hosts_list_" + c + "_count", hostscore.get(host));
prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
if (inCrawler) {
prop.put("hosts_list_" + c + "_crawler_pending", hostNameToPendingCount.get(host));
}
prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0);
if (errors > 0) {
prop.put("hosts_list_" + c + "_errors_exclcount", exclcount);
prop.put("hosts_list_" + c + "_errors_failcount", failcount);
}
prop.put("hosts_list_" + c + "_type", inCrawler ? 2 : errors > 0 ? 1 : 0);
if (onlyCrawling) {
if (inCrawler) c++;
} else if (onlyErrors) {
if (errors > 0) c++;
} else {
c++;
}
}
prop.put("hosts_list", c);
prop.put("hosts_authorized", 1);
prop.put("hosts", 1);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
if (path.length() > 0) {
try {
DigestURL uri = new DigestURL(path);
String host = uri.getHost();
// write host analysis if path after host is empty
if (uri.getPath().length() <= 1 && host != null && host.length() > 0 && sb.getConfigBool("decoration.hostanalysis", false)) {
//how many documents per crawldepth_i; get crawldepth_i facet for host
final ArrayList<String> ff = new ArrayList<>();
for (final CollectionSchema csf: CollectionSchema.values()) {
if ((csf.getType() != SolrType.num_integer && csf.getType() != SolrType.num_long) || csf.isMultiValued()) continue;
final String facetfield = csf.getSolrFieldName();
if (!fulltext.getDefaultConfiguration().contains(facetfield)) continue;
ff.add(csf.getSolrFieldName());
}
// add also vocabulary counters
final Map<String, ReversibleScoreMap<String>> vocabularyFacet = sb.index.fulltext().getDefaultConnector().getFacets(CollectionSchema.vocabularies_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, 100, CollectionSchema.vocabularies_sxt.getSolrFieldName());
if (vocabularyFacet.size() > 0) {
final Collection<String> vocnames = vocabularyFacet.values().iterator().next().keyList(true);
for (final String vocname: vocnames) {
ff.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX);
ff.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX);
}
}
// list the facets
final String[] facetfields = ff.toArray(new String[ff.size()]);
final Map<String, ReversibleScoreMap<String>> facets = fulltext.getDefaultConnector().getFacets(CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"", 100, facetfields);
int fc = 0;
for (final Map.Entry<String, ReversibleScoreMap<String>> facetentry: facets.entrySet()) {
final ReversibleScoreMap<String> facetfieldmap = facetentry.getValue();
if (facetfieldmap.size() == 0) continue;
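// map each numeric facet value to its document count; the TreeMap keeps the values sorted ascending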
final TreeMap<Long, Integer> statMap = new TreeMap<>();
for (final String k: facetfieldmap) statMap.put(Long.parseLong(k), facetfieldmap.get(k));
prop.put("hostanalysis_facets_" + fc + "_facetname", facetentry.getKey());
int c = 0; for (final Entry<Long, Integer> entry: statMap.entrySet()) {
prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_key", entry.getKey());
prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_count", entry.getValue());
prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_a", "solr/collection1/select?q=host_s:" + host + " AND " + facetentry.getKey() + ":" + entry.getKey() + "&defType=edismax&start=0&rows=1000&fl=sku,crawldepth_i");
c++;
}
prop.put("hostanalysis_facets_" + fc + "_facet", c);
fc++;
}
prop.put("hostanalysis_facets", fc);
prop.put("hostanalysis", 1);
}
// write file list for subpath
boolean delete = false;
boolean reload404 = false;
if (post.containsKey("delete")) {
// delete the complete path!! That includes everything that matches this prefix.
delete = true;
}
if (post.containsKey("reload404")) {
// try to re-load all URLs that have load errors and match this prefix.
reload404 = true;
}
final int facetcount = post.getInt("facetcount", 0);
final boolean complete = post.getBoolean("complete");
if (complete) { // we want only root paths for complete lists
p = path.indexOf('/', 10);
if (p > 0) path = path.substring(0, p + 1);
}
prop.put("files_complete", complete ? 1 : 0);
prop.putHTML("files_complete_path", path);
p = path.substring(0, path.length() - 1).lastIndexOf('/');
if (p < 8) {
prop.put("files_root", 1);
} else {
prop.put("files_root", 0);
prop.putHTML("files_root_path", path.substring(0, p + 1));
}
// generate file list from path
prop.putHTML("outbound_host", host);
prop.putHTML("inbound_host", host);
final String hosthash = uri.hosthash();
final String[] pathparts = uri.getPaths();
// get all files for a specific host from the index
final StringBuilder q = new StringBuilder();
if (host == null) {
if (path.startsWith("file://")) {
q.append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(":file");
}
} else {
q.append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(host).append("\"");
}
if (pathparts.length > 0 && pathparts[0].length() > 0) {
for (final String pe: pathparts) {
if (pe.length() > 0) q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(":\"").append(pe).append('\"');
}
} else {
if (facetcount > 1000 || post.containsKey("nepr")) {
q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
}
}
// if reload errors is requested, limit results to pages with some failreason set
if(reload404) q.append(" AND ").append(CollectionSchema.failtype_s.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
final int pageSize = 100;
final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
final List<String> queries = new ArrayList<>();
queries.add(q.toString());
final Thread solrQueryTask = new Thread(fulltext.getDefaultConnector().newDocumentsByQueriesTask(docs, queries, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, pageSize, 1,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),
CollectionSchema.failtype_s.getSolrFieldName(),
CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.crawldepth_i.getSolrFieldName(),
CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(),
CollectionSchema.references_external_i.getSolrFieldName(),
CollectionSchema.references_exthosts_i.getSolrFieldName()
));
solrQueryTask.start();
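// the query task above acts as a producer filling the 'docs' queue; the polling loop below
// consumes documents until a POISON_DOCUMENT marks the end of results or TIMEOUT expires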
final Set<String> storedDocs = new HashSet<String>();
final Map<String, FailType> errorDocs = new HashMap<String, FailType>();
final Set<String> inboundLinks = new HashSet<String>();
final Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
final Map<String, InfoCacheEntry> infoCache = new HashMap<String, InfoCacheEntry>();
int hostsize = 0;
final List<String> deleteIDs = new ArrayList<String>();
final Collection<String> reloadURLs = new ArrayList<String>();
final Set<String> reloadURLCollection = new HashSet<String>();
final long timeoutList = System.currentTimeMillis() + TIMEOUT;
long remainingTime = TIMEOUT;
final long timeoutReferences = System.currentTimeMillis() + TIMEOUT;
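// two separate time budgets: timeoutList limits draining of the result queue,
// timeoutReferences limits the (expensive) reference report lookups for the info cache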
final ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
try {
SolrDocument doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
while (doc != AbstractSolrConnector.POISON_DOCUMENT && doc != null) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
final String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
final FailType error = errortype == null ? null : FailType.valueOf(errortype);
final String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences));
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ids);
} else {
if (error == null) storedDocs.add(u); else {
if (reload404 && error == FailType.fail) {
final ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
if (collections != null) reloadURLCollection.addAll(collections);
reloadURLs.add(u);
}
errorDocs.put(u, error);
}
}
} else if (complete) {
if (error == null) storedDocs.add(u); else {
errorDocs.put(u, error);
}
}
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
if (error == null) {
hostsize++;
// collect inboundlinks to browse the host
Iterator<String> links = URIMetadataNode.getLinks(doc, true);
while (links.hasNext()) {
u = links.next();
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
}
// collect referrer links
links = URIMetadataNode.getLinks(doc, false);
while (links.hasNext()) {
u = links.next();
try {
final MultiProtocolURL mu = new MultiProtocolURL(u);
if (mu.getHost() != null) {
ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
if (lks == null) {
lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
outboundHosts.put(mu.getHost(), lks);
}
lks.set(u, u.length());
}
} catch (final MalformedURLException e) {}
}
}
remainingTime = timeoutList - System.currentTimeMillis();
if (remainingTime <= 0) {
break;
}
doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
}
} finally {
/* Ensure termination and proper resources release of the query thread */
solrQueryTask.interrupt();
}
if (deleteIDs.size() > 0) sb.remove(deleteIDs);
if (reloadURLs.size() > 0) {
final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
for (final String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern);
sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false);
}
// collect from crawler
final List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000);
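// collect up to 1000 pending crawl requests for this host (the 3000 is presumably a time limit in ms; assumed parameter semantics)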
final Set<String> loadingLinks = new HashSet<String>();
for (final Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));
// now combine all lists into one
final Map<String, StoreType> files = new HashMap<String, StoreType>();
for (final String u: storedDocs) files.put(u, StoreType.INDEX);
for (final Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
for (final String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK);
if(!reload404) for (final String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
ConcurrentLog.info("IndexBrowser_p", "collected " + files.size() + " urls for path " + path);
// distinguish files and folders
final Map<String, Object> list = new TreeMap<String, Object>(); // directory listing; a StoreType value marks a file, an int[] value marks a folder with counters
final int pl = path.length();
String file;
for (final Map.Entry<String, StoreType> entry: files.entrySet()) {
if (entry.getKey().length() < pl) continue; // this is not inside the path
if (!entry.getKey().startsWith(path)) continue;
file = entry.getKey().substring(pl);
final StoreType type = entry.getValue();
p = file.indexOf('/');
if (p < 0) {
// this is a file
list.put(entry.getKey(), type); // StoreType value: this is a file (INDEX -> stored in index; LINK -> only linked, maybe in crawler; EXCLUDED/FAILED -> load error)
} else {
// this is a directory path or a file in a subdirectory
final String remainingPath = file.substring(0, p + 1);
if (complete && remainingPath.indexOf('.') > 0) {
list.put(entry.getKey(), type); // StoreType value: this is a file
} else {
final String dir = path + remainingPath;
final Object c = list.get(dir);
if (c == null) {
final int[] linkedStoredIncrawlerError = new int[]{0,0,0,0,0};
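// counter indices (see usage below): [0]=linked, [1]=stored in index, [2]=pending in crawler, [3]=excluded, [4]=load failed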
if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++;
if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++;
if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++;
if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
list.put(dir, linkedStoredIncrawlerError);
} else if (c instanceof int[]) {
if (type == StoreType.LINK) ((int[]) c)[0]++;
if (type == StoreType.INDEX) ((int[]) c)[1]++;
if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++;
if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
}
}
}
}
int maxcount = 1000;
int c = 0;
// first list only folders
int filecounter = 0;
for (final Map.Entry<String, Object> entry: list.entrySet()) {
if ((entry.getValue() instanceof StoreType)) {
filecounter++;
} else {
// this is a folder
prop.put("files_list_" + c + "_type", 1);
prop.putHTML("files_list_" + c + "_type_url", entry.getKey());
final int linked = ((int[]) entry.getValue())[0];
final int stored = ((int[]) entry.getValue())[1];
final int crawler = ((int[]) entry.getValue())[2];
final int excl = ((int[]) entry.getValue())[3];
final int error = ((int[]) entry.getValue())[4];
prop.put("files_list_" + c + "_type_stored", stored);
prop.put("files_list_" + c + "_type_linked", linked);
prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_pending", crawler);
prop.put("files_list_" + c + "_type_excludedVisible", excl > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_excluded", excl);
prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_failed", error);
if (++c >= maxcount) break;
}
}
// then list files
for (final Map.Entry<String, Object> entry: list.entrySet()) {
if (entry.getValue() instanceof StoreType) {
// this is a file
prop.put("files_list_" + c + "_type", 0);
prop.putHTML("files_list_" + c + "_type_url", entry.getKey());
final StoreType type = (StoreType) entry.getValue();
try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
final HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors
final boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
final boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
final boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
if (!dc) {
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
if (type == StoreType.INDEX) {
final String ids = ASCII.String(uri.hash());
final InfoCacheEntry ice = infoCache.get(ids);
prop.put("files_list_" + c + "_type_stored_comment", ice == null ? "" : ice.toString()); // ice.toString() contains html, therefore do not use putHTML here
}
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (error) {
final FailType failType = errorDocs.get(entry.getKey());
if (failType == null) {
// maybe this is only in the errorURL
//Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash()));
prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error");
} else {
final String ids = ASCII.String(uri.hash());
final InfoCacheEntry ice = infoCache.get(ids);
prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail" + (ice == null ? "" : "; " + ice.toString()));
}
}
if (loadRight) {
prop.putHTML("files_list_" + c + "_type_stored_load_url", entry.getKey());
prop.putHTML("files_list_" + c + "_type_stored_load_path", path);
}
if (++c >= maxcount) break;
}
}
}
prop.put("files_list", c);
prop.putHTML("files_path", path);
prop.put("files_hostsize", hostsize);
prop.put("files_subpathloadsize", storedDocs.size());
prop.put("files_subpathdetectedsize", filecounter - storedDocs.size());
prop.put("files", 1);
uri = new DigestURL(path);
if (post.containsKey("showlinkstructure")) {
sb.setConfig(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true);
}
prop.put("files_linkgraph", uri.getPath().length() <= 1 && hostsize > 0 && sb.getConfigBool(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true));
prop.put("files_linkgraph_host", uri.getHost());
// generate inbound-links table
final StructureEntry struct = sb.webStructure.incomingReferences(hosthash);
if (struct != null && struct.references.size() > 0) {
maxcount = 200;
final ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
for (final Map.Entry<String, Integer> entry: struct.references.entrySet()) score.set(entry.getKey(), entry.getValue());
c = 0;
final Iterator<String> i = score.keys(false);
while (i.hasNext() && c < maxcount) {
host = i.next();
prop.putHTML("inbound_list_" + c + "_host", sb.webStructure.hostHash2hostName(host));
prop.put("inbound_list_" + c + "_count", score.get(host));
c++;
}
prop.put("inbound_list", c);
prop.put("inbound", 1);
} else {
prop.put("inbound", 0);
}
// generate outbound-links table
if (outboundHosts.size() > 0) {
maxcount = 200;
final ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
for (final Map.Entry<String, ReversibleScoreMap<String>> entry: outboundHosts.entrySet()) score.set(entry.getKey(), entry.getValue().size());
c = 0;
final Iterator<String> i = score.keys(false);
while (i.hasNext() && c < maxcount) {
host = i.next();
prop.putHTML("outbound_list_" + c + "_host", host);
prop.put("outbound_list_" + c + "_count", score.get(host));
prop.put("outbound_list_" + c + "_link", outboundHosts.get(host).getMinKey());
c++;
}
prop.put("outbound_list", c);
prop.put("outbound", 1);
} else {
prop.put("outbound", 0);
}
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
// for addtoblacklist button
if (post.containsKey("addtoblacklist") && sb != null) {
final File blacklistsPath = sb.getDataPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT);
final String blacklistname = Blacklist.defaultBlacklist(blacklistsPath);
if (blacklistname != null) {
String hoststr = pathURI.getHost();
if (hoststr.startsWith("www.")) { // to allow the automatic matching for www.host.com and host.com
hoststr = hoststr.substring(4);
}
final boolean isok = addBlacklistEntry(
blacklistname,
hoststr + "/.*"); // as yacy expands host pattern to match www.host.com as well as host.com no need to prefix with "*."
if (isok) {
prop.put("files_blkadded", 1); // glyphicon-ok on (display)
} else {
prop.put("files_blkadded", 2); // failure - glypicon-remove on
}
}
} else {
prop.put("files_blkadded", 0); // glyphicon-ok off
}
}
// return rewrite properties
prop.putNum("ucount", fulltext.collectionSize());
return prop;
}
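/**
 * Holds ranking and reference metrics of a single indexed document, read from its
 * Solr record and (optionally) from the citation index; rendered as HTML by toString().
 */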
public static final class InfoCacheEntry {
public Integer cr_n;
public Double cr_c;
public int crawldepth, references, references_internal, references_external, references_exthosts;
public List<String> references_internal_urls, references_external_urls;
public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, final boolean fetchReferences) {
this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
final Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
final Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
final Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
final Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
final Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
this.crawldepth = (cr == null || cr.intValue() < 0) ? 0 : cr.intValue(); // for lazy value storage; non-existent means: stored as '0'
this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
// calculate the url reference list
this.references_internal_urls = new ArrayList<String>();
this.references_external_urls = new ArrayList<String>();
if (fetchReferences) {
// get the references from the citation index
try {
final ReferenceReport rr = rrCache.getReferenceReport(urlhash, false);
final List<String> internalIDs = new ArrayList<String>();
final List<String> externalIDs = new ArrayList<String>();
final HandleSet iids = rr.getInternallIDs();
for (final byte[] b: iids) internalIDs.add(ASCII.String(b));
final HandleSet eids = rr.getExternalIDs();
for (final byte[] b: eids) externalIDs.add(ASCII.String(b));
// get all urls from the index and store them here
for (final String id: internalIDs) {
if (id.equals(urlhash)) continue; // no self-references
final String u = fulltext.getURL(id);
if (u != null) this.references_internal_urls.add(u);
}
for (final String id: externalIDs) {
if (id.equals(urlhash)) continue; // no self-references
final String u = fulltext.getURL(id);
if (u != null) this.references_external_urls.add(u);
}
} catch (final IOException e) {
}
}
this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue();
this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue();
}
@Override
public String toString() {
final StringBuilder sbi = new StringBuilder();
int c = 0;
for (final String s: this.references_internal_urls) {
sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++;
if (c % 80 == 0) sbi.append("<br/>");
}
if (sbi.length() > 0) sbi.insert(0, "<br/>internal referrer:");
final StringBuilder sbe = new StringBuilder();
c = 0;
for (final String s: this.references_external_urls) {
sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++;
if (c % 80 == 0) sbe.append("<br/>");
}
if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:");
return
(this.crawldepth == 998 ? "unknown crawldepth" : this.crawldepth >= 0 ? "crawldepth: " + this.crawldepth : "") +
(this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
(this.cr_n != null ? ", crn=" + this.cr_n : "") +
(this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
}
}
}