merge rc1/master

pull/1/head
reger 11 years ago
parent 082c9a98c1
commit 1437c45383

@ -461,19 +461,21 @@
and old cache.
-->
<filterCache class="solr.FastLRUCache"
size="512"
initialSize="512"
autowarmCount="0"/>
size="64"
initialSize="64"
autowarmCount="4"
cleanupThread="true"/>
<!-- Query Result Cache
Caches results of searches - ordered lists of document ids
(DocList) based on a query, a sort, and the range of documents requested.
-->
<queryResultCache class="solr.LRUCache"
size="512"
initialSize="512"
autowarmCount="0"/>
<queryResultCache class="solr.FastLRUCache"
size="64"
initialSize="64"
autowarmCount="4"
cleanupThread="true"/>
<!-- Document Cache
@ -481,10 +483,11 @@
document). Since Lucene internal document ids are transient,
this cache will not be autowarmed.
-->
<documentCache class="solr.LRUCache"
size="512"
initialSize="512"
autowarmCount="0"/>
<documentCache class="solr.FastLRUCache"
size="64"
initialSize="64"
autowarmCount="4"
cleanupThread="true"/>
<!-- Field Value Cache
@ -494,9 +497,10 @@
-->
<!--
<fieldValueCache class="solr.FastLRUCache"
size="512"
size="64"
autowarmCount="128"
showItems="32" />
showItems="32"
cleanupThread="true"/>
-->
<!-- Custom Cache
@ -510,11 +514,12 @@
-->
<!--
<cache name="myUserCache"
class="solr.LRUCache"
size="4096"
initialSize="1024"
autowarmCount="1024"
class="solr.FastLRUCache"
size="64"
initialSize="64"
autowarmCount="64"
regenerator="com.mycompany.MyRegenerator"
cleanupThread="true"
/>
-->

@ -797,11 +797,6 @@ search.excludehosth=
# the cases of nocache, iffresh and ifexist causes an index deletion
search.verify.delete = true
# images may be treated either as documents that are shown in search results or as objects
# that are only visible in special search environments, like image search
search.excludeintext.image = true
crawler.load.image = true
# remote search details
remotesearch.maxcount = 10
remotesearch.maxtime = 3000

@ -19,7 +19,7 @@
<dt><label for="HTCachePath">The path where the cache is stored</label></dt>
<dd><input name="HTCachePath" id="HTCachePath" type="text" size="20" maxlength="300" value="#[HTCachePath]#" /></dd>
<dt><label for="actualCacheSize">The current size of the cache</label></dt>
<dd><span id="actualCacheSize">#[actualCacheSize]# MB</span></dd>
<dd><span id="actualCacheSize">#[actualCacheSize]# MB for #[actualCacheDocCount]# files, #[docSizeAverage]# KB / file in average </span></dd>
<dt><label for="maxCacheSize">The maximum size of the cache</label></dt>
<dd><input name="maxCacheSize" id="maxCacheSize" type="text" size="8" maxlength="24" value="#[maxCacheSize]#" /> MB</dd>
<dt>&nbsp;</dt>

@ -77,7 +77,9 @@ public class ConfigHTCache_p {
}
prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT));
prop.put("actualCacheSize", (Cache.getActualCacheSize() / 1024 / 1024));
prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024);
prop.put("actualCacheDocCount", Cache.getActualCacheDocCount());
prop.put("docSizeAverage", Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024);
prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64));
// return rewrite properties
return prop;
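Review note: docSizeAverage divides by Cache.getActualCacheDocCount(), which is zero for an empty cache, so the line above can throw an ArithmeticException and break the page. A guarded variant, as a sketch reusing only the calls from this patch:

    long docCount = Cache.getActualCacheDocCount();
    prop.put("actualCacheDocCount", docCount);
    // avoid division by zero on a freshly created cache
    prop.put("docSizeAverage", docCount == 0 ? 0 : Cache.getActualCacheSize() / docCount / 1024);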

@ -34,7 +34,7 @@ public class ContentAnalysis_p {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
if (post != null && post.containsKey("EnterDoublecheck")) {
Ranking.setMinTokenLen(post.getInt("minTokenLen", 3));

@ -553,7 +553,6 @@ public class HostBrowser {
}
} catch (final IOException e) {
}
}
this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue();
this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue();
@ -562,7 +561,7 @@ public class HostBrowser {
StringBuilder sbi = new StringBuilder();
int c = 0;
for (String s: references_internal_urls) {
sbi.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++;
if (c % 80 == 0) sbi.append("<br/>");
}
@ -570,7 +569,7 @@ public class HostBrowser {
StringBuilder sbe = new StringBuilder();
c = 0;
for (String s: references_external_urls) {
sbe.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++;
if (c % 80 == 0) sbe.append("<br/>");
}

@ -193,6 +193,9 @@ function updatepage(str) {
<dt class="TableCellDark">URL Filter</dt>
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">query</dt>
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd>Only Domain:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;

@ -261,7 +261,8 @@ public class IndexControlURLs_p {
final File f = new File(s);
f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*");
final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);
final String query = post.get("exportquery", "*:*");
final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());

@ -38,7 +38,7 @@ public class RankingSolr_p {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
int profileNr = 0;
if (post != null) profileNr = post.getInt("profileNr", profileNr);

@ -360,7 +360,7 @@ public class yacysearch {
// check available memory and clean up if necessary
if ( !MemoryControl.request(8000000L, false) ) {
indexSegment.clearCache();
indexSegment.clearCaches();
SearchEventCache.cleanupEvents(false);
}

@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.html.CharacterCoding;
/**
* MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -66,7 +67,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
public static final MultiProtocolURL POISON = new MultiProtocolURL(); // poison pill for concurrent link generators
private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
private static final long serialVersionUID = -1173233022912141884L;
private static final long SMB_TIMEOUT = 5000;
@ -636,7 +636,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
} else {
this.searchpart = this.path.substring(r + 1);
// strip &amp;
Matcher matcher = ampPattern.matcher(this.searchpart);
Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
while (matcher.find()) {
this.searchpart = matcher.replaceAll("&");
matcher.reset(this.searchpart);

@ -21,7 +21,6 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -235,7 +234,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
* @return the subset of the given ids which exist in the index
* @throws IOException
*/
public Set<String> existsByIds(Collection<String> ids) throws IOException {
public Set<String> existsByIds(Set<String> ids) throws IOException {
if (ids == null || ids.size() == 0) return new HashSet<String>();
// construct raw query
final SolrQuery params = new SolrQuery();

@ -61,7 +61,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
this.missCache = new ConcurrentARC<String, Object>(missCacheMax, partitions);
}
public void clearCache() {
public void clearCaches() {
this.hitCache.clear();
this.missCache.clear();
this.documentCache.clear();
@ -70,9 +70,9 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
@Override
public synchronized void close() {
this.clearCaches();
if (this.solr != null) this.solr.close();
this.solr = null;
this.clearCache();
}
/**
@ -81,7 +81,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
*/
@Override
public void clear() throws IOException {
this.clearCache();
this.clearCaches();
if (this.solr != null) this.solr.clear();
}
@ -119,7 +119,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
@Override
public void deleteByQuery(final String querystring) throws IOException {
this.clearCache();
this.clearCaches();
this.solr.deleteByQuery(querystring);
}
@ -261,7 +261,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
}
private void addToCache(SolrDocumentList list, boolean doccache) {
if (MemoryControl.shortStatus()) clearCache();
if (MemoryControl.shortStatus()) clearCaches();
for (final SolrDocument solrdoc: list) {
addToCache(solrdoc, doccache);
}

@ -118,6 +118,12 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
ensureAliveUpdateHandler();
}
@Override
public void clearCaches() {
this.connector.clearCaches();
this.idCache.clear();
}
/**
* used for debugging
*/
@ -326,10 +332,11 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
@Override
public Set<String> existsByIds(Collection<String> ids) throws IOException {
public Set<String> existsByIds(Set<String> ids) throws IOException {
HashSet<String> e = new HashSet<String>();
if (ids == null || ids.size() == 0) return e;
Collection<String> idsC = new HashSet<String>();
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : e;
Set<String> idsC = new HashSet<String>();
for (String id: ids) {
if (this.idCache.has(ASCII.getBytes(id))) {cacheSuccessSign(); e.add(id); continue;}
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); continue;}
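Since existsByIds now requires a Set instead of any Collection, call sites that still hold a plain Collection need a small adaptation. A minimal sketch (connector stands for any SolrConnector; imports as in the surrounding files):

    Collection<String> ids = new ArrayList<String>();
    ids.add("Ij7B63g-gSHA"); // sample id, taken from the comment in EmbeddedSolrConnector
    Set<String> idSet = (ids instanceof Set) ? (Set<String>) ids : new HashSet<String>(ids);
    Set<String> existing = connector.existsByIds(idSet);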

@ -22,7 +22,6 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
@ -35,6 +34,7 @@ import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.Query;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
@ -48,10 +48,14 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.request.UnInvertedField;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QueryResultKey;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
@ -89,6 +93,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
super.init(this.instance.getServer(coreName));
}
public void clearCaches() {
SolrConfig solrConfig = this.core.getSolrConfig();
@SuppressWarnings("unchecked")
SolrCache<String, UnInvertedField> fieldValueCache = solrConfig.fieldValueCacheConfig == null ? null : solrConfig.fieldValueCacheConfig.newInstance();
if (fieldValueCache != null) fieldValueCache.clear();
@SuppressWarnings("unchecked")
SolrCache<Query, DocSet> filterCache= solrConfig.filterCacheConfig == null ? null : solrConfig.filterCacheConfig.newInstance();
if (filterCache != null) filterCache.clear();
@SuppressWarnings("unchecked")
SolrCache<QueryResultKey, DocList> queryResultCache = solrConfig.queryResultCacheConfig == null ? null : solrConfig.queryResultCacheConfig.newInstance();
if (queryResultCache != null) queryResultCache.clear();
@SuppressWarnings("unchecked")
SolrCache<Integer, Document> documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance();
if (documentCache != null) documentCache.clear();
}
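Review note: CacheConfig.newInstance() constructs a new SolrCache object rather than returning the one held by the live SolrIndexSearcher, so each clear() above seems to operate on a fresh, already-empty instance. Reduced to its core, the pattern is:

    SolrCache<Query, DocSet> fresh = solrConfig.filterCacheConfig.newInstance();
    fresh.clear(); // clears only the fresh instance
    // the live searcher's filterCache is a different object and keeps its entries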
public SolrInstance getInstance() {
return this.instance;
}
@ -224,9 +244,9 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
}
@Override
public Set<String> existsByIds(Collection<String> ids) {
public Set<String> existsByIds(Set<String> ids) {
if (ids == null || ids.size() == 0) return new HashSet<String>();
if (ids.size() == 1 && ids instanceof Set) return existsById(ids.iterator().next()) ? (Set<String>) ids : new HashSet<String>();
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
StringBuilder sb = new StringBuilder(); // construct something like "({!raw f=id}Ij7B63g-gSHA) OR ({!raw f=id}PBcGI3g-gSHA)"
for (String id: ids) {
sb.append("({!raw f=").append(CollectionSchema.id.getSolrFieldName()).append('}').append(id).append(") OR ");

@ -53,6 +53,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
this.solr0 = solr0;
this.solr1 = solr1;
}
@Override
public void clearCaches() {
if (this.solr0 != null) this.solr0.clearCaches();
if (this.solr1 != null) this.solr1.clearCaches();
}
public boolean isConnected0() {
return this.solr0 != null;
@ -347,7 +353,9 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public Set<String> existsByIds(Collection<String> ids) throws IOException {
public Set<String> existsByIds(Set<String> ids) throws IOException {
if (ids == null || ids.size() == 0) return new HashSet<String>();
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
if (this.solr0 != null && this.solr1 == null) return this.solr0.existsByIds(ids);
if (this.solr0 == null && this.solr1 != null) return this.solr1.existsByIds(ids);
Set<String> s = new HashSet<String>();

@ -71,6 +71,11 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn
super.close();
}
@Override
public void clearCaches() {
// we do not have direct access to the caches here, thus we simply do nothing.
}
@Override
public QueryResponse getResponseByParams(ModifiableSolrParams params) throws IOException {
// during the solr query we set the thread name to the query string to get more debugging info in thread dumps
@ -134,4 +139,5 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn
}
System.exit(0);
}
}

@ -36,7 +36,12 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
/**
* clear all caches: inside solr and outside solr within the implementations of this interface
*/
public void clearCaches();
/**
* get the size of the index
* @return number of results if solr is queried with a catch-all pattern
@ -106,7 +111,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @return the subset of the given ids which exist in the index
* @throws IOException
*/
public Set<String> existsByIds(Collection<String> ids) throws IOException;
public Set<String> existsByIds(Set<String> ids) throws IOException;
/**
* check if a given document exists in solr

@ -64,7 +64,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
public SolrServer getServer() {
return this.server;
}
@Override
public void commit(final boolean softCommit) {
synchronized (this.server) {

@ -24,7 +24,6 @@ import java.util.Collection;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.federate.solr.connector.CachedSolrConnector;
import net.yacy.cora.federate.solr.connector.ConcurrentUpdateSolrConnector;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
@ -161,9 +160,9 @@ public class InstanceMirror {
return msc;
}
public void clearCache() {
public void clearCaches() {
for (SolrConnector csc: this.connectorCache.values()) {
if (csc instanceof CachedSolrConnector) ((CachedSolrConnector) csc).clearCache();
csc.clearCaches();
}
for (EmbeddedSolrConnector ssc: this.embeddedCache.values()) ssc.commit(true);
}

@ -1,195 +1,193 @@
/**
* HTMLResponseWriter
* Copyright 2013 by Michael Peter Christen
* First released 09.06.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.responsewriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.XML;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.QueryResponseWriter;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
public class HTMLResponseWriter implements QueryResponseWriter {
private static final Set<String> DEFAULT_FIELD_LIST = null;
private static final Pattern dqp = Pattern.compile("\"");
public HTMLResponseWriter() {
super();
}
@Override
public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
return "text/html";
}
@Override
public void init(@SuppressWarnings("rawtypes") NamedList n) {
}
@Override
public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
NamedList<?> values = rsp.getValues();
assert values.get("responseHeader") != null;
assert values.get("response") != null;
writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
//writer.write("<!--\n");
//writer.write("this is a XHTML+RDFa file. It contains RDF annotations with dublin core properties\n");
//writer.write("you can validate it with http://validator.w3.org/\n");
//writer.write("-->\n");
writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\"\n");
writer.write(" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n");
writer.write(" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n");
writer.write(" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\">\n");
writer.write("<head profile=\"http://www.w3.org/2003/g/data-view\">\n");
//writer.write("<link rel=\"transformation\" href=\"http://www-sop.inria.fr/acacia/soft/RDFa2RDFXML.xsl\"/>\n");
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
paramsList.remove("wt");
String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</div>\n");
DocList response = ((ResultContext) values.get("response")).docs;
final int sz = response.size();
if (sz > 0) {
SolrIndexSearcher searcher = request.getSearcher();
DocIterator iterator = response.iterator();
IndexSchema schema = request.getSchema();
int id = iterator.nextDoc();
Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
if (sz == 1) {
writer.write("<title>" + title + "</title>\n</head><body>\n");
} else {
writer.write("<title>Document List</title>\n</head><body>\n");
}
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
writeDoc(writer, tdoc, title);
while (iterator.hasNext()) {
id = iterator.nextDoc();
doc = searcher.doc(id, DEFAULT_FIELD_LIST);
tdoc = translateDoc(schema, doc);
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
writeDoc(writer, tdoc, title);
}
} else {
writer.write("<title>No Document Found</title>\n</head><body>\n");
}
writer.write("</body></html>\n");
}
private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
writer.write("<fieldset>\n");
writer.write("<h1 property=\"dc:Title\">" + title + "</h1>\n");
writer.write("<dl>\n");
for (Map.Entry<String, String> entry: tdoc.entrySet()) {
writer.write("<dt>");
writer.write(entry.getKey());
writer.write("</dt><dd>");
XML.escapeAttributeValue(entry.getValue(), writer);
writer.write("</dd>\n");
}
writer.write("</dl>\n");
writer.write("</fieldset>\n");
writer.write("</form>\n");
}
static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
List<IndexableField> fields = doc.getFields();
int sz = fields.size();
int fidx1 = 0, fidx2 = 0;
LinkedHashMap<String, String> kv = new LinkedHashMap<String, String>();
while (fidx1 < sz) {
IndexableField value = fields.get(fidx1);
String fieldName = value.name();
fidx2 = fidx1 + 1;
while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) {
fidx2++;
}
SchemaField sf = schema.getFieldOrNull(fieldName);
if (sf == null) sf = new SchemaField(fieldName, new TextField());
FieldType type = sf.getType();
if (fidx1 + 1 == fidx2) {
if (sf.multiValued()) {
String sv = value.stringValue();
kv.put(fieldName, field2string(type, sv));
} else {
kv.put(fieldName, field2string(type, value.stringValue()));
}
} else {
for (int i = fidx1; i < fidx2; i++) {
String sv = fields.get(i).stringValue();
kv.put(fieldName + "_" + i, field2string(type, sv));
}
}
fidx1 = fidx2;
}
return kv;
}
@SuppressWarnings("deprecation")
private static String field2string(final FieldType type, final String value) {
String typeName = type.getTypeName();
if (typeName.equals(SolrType.bool.printName())) {
return "F".equals(value) ? "false" : "true";
} else if (typeName.equals(SolrType.date.printName())) {
return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here
}
return value;
}
// XML.escapeCharData(val, writer);
}
/**
* HTMLResponseWriter
* Copyright 2013 by Michael Peter Christen
* First released 09.06.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.responsewriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.XML;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.QueryResponseWriter;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
public class HTMLResponseWriter implements QueryResponseWriter {
private static final Set<String> DEFAULT_FIELD_LIST = null;
private static final Pattern dqp = Pattern.compile("\"");
public HTMLResponseWriter() {
super();
}
@Override
public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
return "text/html";
}
@Override
public void init(@SuppressWarnings("rawtypes") NamedList n) {
}
@Override
public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
NamedList<?> values = rsp.getValues();
assert values.get("responseHeader") != null;
assert values.get("response") != null;
writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
//writer.write("<!--\n");
//writer.write("this is a XHTML+RDFa file. It contains RDF annotations with dublin core properties\n");
//writer.write("you can validate it with http://validator.w3.org/\n");
//writer.write("-->\n");
writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\"\n");
writer.write(" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n");
writer.write(" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n");
writer.write(" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\">\n");
writer.write("<head profile=\"http://www.w3.org/2003/g/data-view\">\n");
//writer.write("<link rel=\"transformation\" href=\"http://www-sop.inria.fr/acacia/soft/RDFa2RDFXML.xsl\"/>\n");
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
paramsList.remove("wt");
String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
DocList response = ((ResultContext) values.get("response")).docs;
final int sz = response.size();
if (sz > 0) {
SolrIndexSearcher searcher = request.getSearcher();
DocIterator iterator = response.iterator();
IndexSchema schema = request.getSchema();
int id = iterator.nextDoc();
Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
if (sz == 1) {
writer.write("<title>" + title + "</title>\n</head><body>\n");
} else {
writer.write("<title>Document List</title>\n</head><body>\n");
}
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
writeDoc(writer, tdoc, title);
while (iterator.hasNext()) {
id = iterator.nextDoc();
doc = searcher.doc(id, DEFAULT_FIELD_LIST);
tdoc = translateDoc(schema, doc);
title = tdoc.get(CollectionSchema.title.getSolrFieldName());
writeDoc(writer, tdoc, title);
}
} else {
writer.write("<title>No Document Found</title>\n</head><body>\n");
}
writer.write("</body></html>\n");
}
private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
writer.write("<fieldset>\n");
writer.write("<h1 property=\"dc:Title\">" + title + "</h1>\n");
writer.write("<dl>\n");
for (Map.Entry<String, String> entry: tdoc.entrySet()) {
writer.write("<dt>");
writer.write(entry.getKey());
writer.write("</dt><dd>");
XML.escapeAttributeValue(entry.getValue(), writer);
writer.write("</dd>\n");
}
writer.write("</dl>\n");
writer.write("</fieldset>\n");
writer.write("</form>\n");
}
static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
List<IndexableField> fields = doc.getFields();
int sz = fields.size();
int fidx1 = 0, fidx2 = 0;
LinkedHashMap<String, String> kv = new LinkedHashMap<String, String>();
while (fidx1 < sz) {
IndexableField value = fields.get(fidx1);
String fieldName = value.name();
fidx2 = fidx1 + 1;
while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) {
fidx2++;
}
SchemaField sf = schema.getFieldOrNull(fieldName);
if (sf == null) sf = new SchemaField(fieldName, new TextField());
FieldType type = sf.getType();
if (fidx1 + 1 == fidx2) {
if (sf.multiValued()) {
String sv = value.stringValue();
kv.put(fieldName, field2string(type, sv));
} else {
kv.put(fieldName, field2string(type, value.stringValue()));
}
} else {
for (int i = fidx1; i < fidx2; i++) {
String sv = fields.get(i).stringValue();
kv.put(fieldName + "_" + i, field2string(type, sv));
}
}
fidx1 = fidx2;
}
return kv;
}
@SuppressWarnings("deprecation")
private static String field2string(final FieldType type, final String value) {
String typeName = type.getTypeName();
if (typeName.equals(SolrType.bool.printName())) {
return "F".equals(value) ? "false" : "true";
} else if (typeName.equals(SolrType.date.printName())) {
return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here
}
return value;
}
// XML.escapeCharData(val, writer);
}
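Review note on this whole-file hunk (195 lines before, 193 after): the only change is that the API icon block is no longer written inside <head> directly after the stylesheet links; that early copy, which also never closed its <span>, is removed, and the copy emitted after <body> opens is kept:

    // removed from <head> in the rewritten version:
    writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
    writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</div>\n"); // <span> never closed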

@ -55,6 +55,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.workflow.WorkflowProcessor;
@ -347,17 +348,10 @@ public final class CrawlStacker {
// check availability of parser and maxfilesize
String warning = null;
boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
// dammit semicolon
// TODO: remove this shit later
Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
loadImages = true;
}
ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
contentDomain == ContentDomain.APP ||
(!loadImages && contentDomain == ContentDomain.IMAGE) ||
(contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
contentDomain == ContentDomain.AUDIO ||
contentDomain == ContentDomain.VIDEO ||
contentDomain == ContentDomain.CTRL) {

@ -182,6 +182,14 @@ public final class Cache {
public static long getActualCacheSize() {
return fileDBunbuffered.length();
}
/**
* get the number of documents currently stored in the cache
* @return the document count
*/
public static long getActualCacheDocCount() {
return fileDBunbuffered.size();
}
/**
* close the databases

@ -41,7 +41,10 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.RowHandleSet;
public class BookmarksDB {
@ -147,11 +150,6 @@ public class BookmarksDB {
ConcurrentLog.logException(e);
}
}
public String addBookmark(final Bookmark bookmark){
saveBookmark(bookmark);
return bookmark.getUrlHash();
}
public Bookmark getBookmark(final String urlHash) throws IOException {
try {
@ -214,18 +212,13 @@ public class BookmarksDB {
final TreeSet<String> set=new TreeSet<String>(new bookmarkComparator(true));
final String tagHash=BookmarkHelper.tagHash(tagName);
final Tag tag=getTag(tagHash);
Set<String> hashes=new HashSet<String>();
if (tag != null) {
hashes=getTag(tagHash).getUrlHashes();
}
RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes();
if (priv) {
set.addAll(hashes);
for (byte[] hash: hashes) set.add(ASCII.String(hash));
} else {
final Iterator<String> it=hashes.iterator();
Bookmark bm;
while(it.hasNext()){
for (byte[] hash: hashes) {
try {
bm = getBookmark(it.next());
Bookmark bm = getBookmark(ASCII.String(hash));
if (bm != null && bm.getPublic()) {
set.add(bm.getUrlHash());
}
@ -249,7 +242,7 @@ public class BookmarksDB {
* retrieve an object of type Tag from the tagCache; if the object is not cached, return loadTag(hash)
* @param hash an object of type String, containing a tagHash
*/
public Tag getTag(final String hash){
private Tag getTag(final String hash){
return this.tags.get(hash); //null if it does not exists
}
@ -257,7 +250,7 @@ public class BookmarksDB {
* store a Tag in tagsTable or remove an empty tag
* @param tag an object of type Tag to be stored/removed
*/
public void putTag(final Tag tag){
private void putTag(final Tag tag){
if (tag == null) return;
if (tag.isEmpty()) {
this.tags.remove(tag.getTagHash());
@ -266,7 +259,7 @@ public class BookmarksDB {
}
}
public void removeTag(final String hash) {
private void removeTag(final String hash) {
this.tags.remove(hash);
}
@ -301,7 +294,7 @@ public class BookmarksDB {
return set.iterator();
}
public Iterator<Tag> getTagIterator(final String tagName, final boolean priv, final int comp) {
private Iterator<Tag> getTagIterator(final String tagName, final boolean priv, final int comp) {
final TreeSet<Tag> set=new TreeSet<Tag>((comp == SORT_SIZE) ? tagSizeComparator : tagComparator);
Iterator<String> it=null;
final Iterator<String> bit=getBookmarksIterator(tagName, priv);
@ -347,14 +340,14 @@ public class BookmarksDB {
final Tag oldTag=getTag(BookmarkHelper.tagHash(oldName));
if (oldTag != null) {
final Set<String> urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag
final RowHandleSet urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag
removeTag(BookmarkHelper.tagHash(oldName)); // remove oldHash from TagsDB
Bookmark bookmark;
Set<String> tagSet = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
for (final String urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName
for (final byte[] urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName
try {
bookmark = getBookmark(urlHash);
bookmark = getBookmark(ASCII.String(urlHash));
tagSet = bookmark.getTags();
tagSet.remove(oldName);
bookmark.setTags(tagSet, true); // might not be needed, but doesn't hurt
@ -371,9 +364,9 @@ public class BookmarksDB {
public void addTag(final String selectTag, final String newTag) {
Bookmark bookmark;
for (final String urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag
for (final byte[] urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag
try {
bookmark = getBookmark(urlHash);
bookmark = getBookmark(ASCII.String(urlHash));
bookmark.addTag(newTag);
saveBookmark(bookmark);
} catch (final IOException e) {
@ -389,51 +382,24 @@ public class BookmarksDB {
* Subclass of bookmarksDB, which provides the Tag object-type
*/
public class Tag {
public static final String URL_HASHES = "urlHashes";
public static final String TAG_NAME = "tagName";
private final String tagHash;
private final Map<String, String> mem;
private Set<String> urlHashes;
public Tag(final String hash, final Map<String, String> map){
this.tagHash = hash;
this.mem = map;
if (this.mem.containsKey(URL_HASHES)) {
this.urlHashes = ListManager.string2set(this.mem.get(URL_HASHES));
} else {
this.urlHashes = new HashSet<String>();
}
}
private final String tagName;
private RowHandleSet urlHashes;
public Tag(final String name, final HashSet<String> entries){
private Tag(final String name) {
this.tagHash = BookmarkHelper.tagHash(name);
this.mem = new HashMap<String, String>();
//mem.put(URL_HASHES, listManager.arraylist2string(entries));
this.urlHashes = entries;
this.mem.put(TAG_NAME, name);
}
public Tag(final String name){
this(name, new HashSet<String>());
}
public Map<String, String> getMap(){
this.mem.put(URL_HASHES, ListManager.collection2string(this.urlHashes));
return this.mem;
this.tagName = name;
this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
}
/**
* get the lowercase Tagname
*/
public String getTagName(){
/*if(this.mem.containsKey(TAG_NAME)){
return (String) this.mem.get(TAG_NAME);
}
return "";*/
return getFriendlyName().toLowerCase();
}
public String getTagHash(){
private String getTagHash(){
return this.tagHash;
}
@ -441,37 +407,33 @@ public class BookmarksDB {
* @return the tag name, with all uppercase chars
*/
public String getFriendlyName(){
/*if(this.mem.containsKey(TAG_FRIENDLY_NAME)){
return (String) this.mem.get(TAG_FRIENDLY_NAME);
}
return getTagName();*/
if(this.mem.containsKey(TAG_NAME)){
return this.mem.get(TAG_NAME);
}
return "notagname";
return this.tagName;
}
public Set<String> getUrlHashes(){
private RowHandleSet getUrlHashes(){
return this.urlHashes;
}
public boolean hasPublicItems(){
private boolean hasPublicItems(){
return getBookmarksIterator(getTagName(), false).hasNext();
}
public void addUrl(final String urlHash){
this.urlHashes.add(urlHash);
private void addUrl(final String urlHash){
try {
this.urlHashes.put(ASCII.getBytes(urlHash));
} catch (SpaceExceededException e) {
}
}
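Review note: addUrl swallows SpaceExceededException silently, so a url hash can drop out of a tag without any trace. Logging it would be cheap; a sketch (ConcurrentLog is already imported in this file):

    private void addUrl(final String urlHash) {
        try {
            this.urlHashes.put(ASCII.getBytes(urlHash));
        } catch (final SpaceExceededException e) {
            ConcurrentLog.logException(e); // at least record that a hash was dropped
        }
    }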
public void delete(final String urlHash){
this.urlHashes.remove(urlHash);
private void delete(final String urlHash){
this.urlHashes.remove(ASCII.getBytes(urlHash));
}
public int size(){
return this.urlHashes.size();
}
public boolean isEmpty() {
private boolean isEmpty() {
return this.urlHashes.isEmpty();
}
}
@ -481,27 +443,19 @@ public class BookmarksDB {
*/
public class Bookmark {
public static final String BOOKMARK_URL = "bookmarkUrl";
private static final String BOOKMARK_URL = "bookmarkUrl";
public static final String BOOKMARK_TITLE = "bookmarkTitle";
public static final String BOOKMARK_DESCRIPTION = "bookmarkDesc";
public static final String BOOKMARK_TAGS = "bookmarkTags";
public static final String BOOKMARK_PUBLIC = "bookmarkPublic";
public static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp";
public static final String BOOKMARK_OWNER = "bookmarkOwner";
public static final String BOOKMARK_IS_FEED = "bookmarkIsFeed";
private static final String BOOKMARK_TAGS = "bookmarkTags";
private static final String BOOKMARK_PUBLIC = "bookmarkPublic";
private static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp";
private static final String BOOKMARK_OWNER = "bookmarkOwner";
private static final String BOOKMARK_IS_FEED = "bookmarkIsFeed";
private final String urlHash;
private Set<String> tagNames;
private long timestamp;
private final Map<String, String> entry;
public Bookmark(final String urlHash, final Map<String, String> map) {
this.entry = map;
this.urlHash = urlHash;
this.tagNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS)));
loadTimestamp();
}
public Bookmark(final DigestURL url) {
this.entry = new HashMap<String, String>();
this.urlHash = ASCII.String(url.hash());
@ -529,11 +483,15 @@ public class BookmarksDB {
this(new DigestURL((url.indexOf("://") < 0) ? "http://" + url : url));
}
public Bookmark(final Map<String, String> map) throws MalformedURLException {
this(ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()), map);
private Bookmark(final Map<String, String> map) throws MalformedURLException {
this.entry = map;
this.urlHash = ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash());
this.tagNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS)));
loadTimestamp();
}
Map<String, String> toMap() {
private Map<String, String> toMap() {
this.entry.put(BOOKMARK_TAGS, ListManager.collection2string(this.tagNames));
this.entry.put(BOOKMARK_TIMESTAMP, String.valueOf(this.timestamp));
return this.entry;
@ -688,11 +646,11 @@ public class BookmarksDB {
/**
* Subclass of bookmarksDB, which provides the bookmarkIterator object-type
*/
public class bookmarkIterator implements Iterator<Bookmark> {
private class bookmarkIterator implements Iterator<Bookmark> {
Iterator<byte[]> bookmarkIter;
public bookmarkIterator(final boolean up) throws IOException {
private bookmarkIterator(final boolean up) throws IOException {
//flushBookmarkCache(); //XXX: this will cost performance
this.bookmarkIter = BookmarksDB.this.bookmarks.keys(up, false);
//this.nextEntry = null;
@ -722,14 +680,14 @@ public class BookmarksDB {
/**
* Comparator to sort objects of type Bookmark according to their timestamps
*/
public class bookmarkComparator implements Comparator<String> {
private class bookmarkComparator implements Comparator<String> {
private final boolean newestFirst;
/**
* @param newestFirst newest first, or oldest first?
*/
public bookmarkComparator(final boolean newestFirst){
private bookmarkComparator(final boolean newestFirst){
this.newestFirst = newestFirst;
}
@ -752,13 +710,13 @@ public class BookmarksDB {
}
}
public static final TagComparator tagComparator = new TagComparator();
public static final TagSizeComparator tagSizeComparator = new TagSizeComparator();
private static final TagComparator tagComparator = new TagComparator();
private static final TagSizeComparator tagSizeComparator = new TagSizeComparator();
/**
* Comparator to sort objects of type Tag according to their names
*/
public static class TagComparator implements Comparator<Tag>, Serializable {
private static class TagComparator implements Comparator<Tag>, Serializable {
/**
* generated serial
@ -772,7 +730,7 @@ public class BookmarksDB {
}
public static class TagSizeComparator implements Comparator<Tag>, Serializable {
private static class TagSizeComparator implements Comparator<Tag>, Serializable {
/**
* generated serial

@ -26,12 +26,15 @@ package net.yacy.document.parser.html;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
* Contains methods to convert between Unicode and XML/HTML encoding.
*/
public final class CharacterCoding {
/** Ampersand pattern */
public final static Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
/** Ampersand character in unicode encoding. */
private static final char AMP_UNICODE = "\u0026".charAt(0);
/** Ampersand character in HTML encoding. */
@ -276,14 +279,15 @@ public final class CharacterCoding {
}
return sb.toString();
}
/**
* Replaces HTML-encoded characters with unicode representation.
* @param text text with character to replace
* @return text with replaced characters
*/
public static String html2unicode(final String text) {
public static String html2unicode(String text) {
if (text == null) return null;
text = ampPattern.matcher(text).replaceAll("&"); // sometimes a double-replacement is necessary.
int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length());
String s;
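A quick illustration of the double-encoding case that the new pre-replacement handles (hypothetical input):

    String s = CharacterCoding.html2unicode("Tom &amp;amp; Jerry");
    // pass 1 (ampPattern):  "&amp;amp;" -> "&amp;"
    // entity decoding:      "&amp;"     -> "&"      => "Tom & Jerry"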

@ -204,11 +204,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
public void scrapeText(final char[] newtext, final String insideTag) {
public void scrapeText(final char[] newtext0, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
// match evaluation pattern
this.evaluationScores.match(Element.text, newtext);
@ -466,7 +467,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
final String href = tagopts.getProperty("href", EMPTY_STRING);
String href = tagopts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
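Effect of the new href decoding, as a sketch with hypothetical markup:

    // <a href="/search?q=a&amp;start=10"> used to reach absolutePath() with the
    // literal "&amp;" still inside the query string; now it is decoded first:
    String href = CharacterCoding.html2unicode("/search?q=a&amp;start=10");
    // href == "/search?q=a&start=10"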

@ -32,27 +32,15 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.Date;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDMMType1Font;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont;
import org.apache.pdfbox.pdmodel.font.PDType1CFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.id.AnchorURL;
@ -222,25 +210,54 @@ public class pdfParser extends AbstractParser implements Parser {
false,
docDate)};
}
@SuppressWarnings("static-access")
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
// Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
PDFont.clearResources();
COSName.clearResources();
PDType1Font.clearResources();
PDTrueTypeFont.clearResources();
PDType0Font.clearResources();
PDType1AfmPfbFont.clearResources();
PDType3Font.clearResources();
PDType1CFont.clearResources();
PDCIDFont.clearResources();
PDCIDFontType0Font.clearResources();
PDCIDFontType2Font.clearResources();
PDMMType1Font.clearResources();
PDSimpleFont.clearResources();
ResourceCleaner cl = new ResourceCleaner();
cl.clearClassResources("org.apache.pdfbox.cos.COSName");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont");
}
@SuppressWarnings({ "unchecked", "rawtypes" })
private static class ResourceCleaner {
Method findLoadedClass;
private ClassLoader sys;
public ResourceCleaner() {
try {
this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class });
this.findLoadedClass.setAccessible(true);
this.sys = ClassLoader.getSystemClassLoader();
} catch (Throwable e) {
e.printStackTrace();
this.findLoadedClass = null;
this.sys = null;
}
}
public void clearClassResources(String name) {
if (this.findLoadedClass == null) return;
try {
Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name);
if (pdfparserpainclass != null) {
Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {});
if (clearResources != null) clearResources.invoke(null);
}
} catch (Throwable e) {
e.printStackTrace();
}
}
}
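The public entry point is unchanged, so callers are unaffected. The detour via ClassLoader.findLoadedClass also has a nice side effect: a font class that was never loaded is not pulled in just to clear its (empty) cache.

    // usage stays as before:
    pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();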
/**

@ -37,12 +37,12 @@ public class CrashProtectionHandler extends HandlerWrapper implements Handler, H
}
private void writeResponse(HttpServletRequest request, HttpServletResponse response, Exception exc) throws IOException {
PrintWriter out = response.getWriter();
out.println("Ops!");
out.println();
out.println("Message: " + exc.getMessage());
exc.printStackTrace(out);
response.setContentType("text/plain");
response.setStatus(500);
PrintWriter out = response.getWriter();
out.println("Ops!");
out.println();
out.println("Message: " + exc.getMessage());
exc.printStackTrace(out);
response.setContentType("text/plain");
response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
}
}
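Review note: both the old and the new version set the content type and status only after writing to the response; once the response is committed those calls are silently ignored. The conventional order is (a sketch):

    response.setContentType("text/plain");
    response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
    PrintWriter out = response.getWriter();
    out.println("Ops!"); // sic
    exc.printStackTrace(out);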

@ -91,7 +91,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
HttpServletResponse response) throws IOException, ServletException {
RequestHeader proxyHeaders = convertHeaderFromJetty(request);
final String httpVer = (String) request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER);
final String httpVer = request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER);
setViaHeader (proxyHeaders, httpVer);
proxyHeaders.remove(RequestHeader.KEEP_ALIVE);
proxyHeaders.remove(RequestHeader.CONTENT_LENGTH);

@ -27,7 +27,6 @@ package net.yacy.http;
import java.io.IOException;
import java.io.OutputStream;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

@ -97,11 +97,6 @@ public class TemplateHandler extends AbstractHandler implements Handler {
htDocsPath = Switchboard.getSwitchboard().htDocsPath.getPath();
}
@Override
protected void doStop() throws Exception {
super.doStop();
}
/** Returns a path to the localized or default file according to the parameter localeSelection
* @param path relative from htroot
* @param localeSelection language of localized file; locale.language from switchboard is used if localeSelection.equals("") */

@ -17,13 +17,13 @@ import java.net.SocketException;
*/
public interface YaCyHttpServer {
abstract public void startupServer() throws Exception;
abstract public void stop() throws Exception;
abstract public void setMaxSessionCount(int cnt);
abstract public InetSocketAddress generateSocketAddress(String port) throws SocketException;
abstract public int getMaxSessionCount();
abstract public int getJobCount();
abstract public boolean withSSL();
abstract public void reconnect(int milsec);
abstract public String getVersion();
abstract void startupServer() throws Exception;
abstract void stop() throws Exception;
abstract void setMaxSessionCount(int cnt);
abstract InetSocketAddress generateSocketAddress(String port) throws SocketException;
abstract int getMaxSessionCount();
abstract int getJobCount();
abstract boolean withSSL();
abstract void reconnect(int milsec);
abstract String getVersion();
}
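Design note: interface members are implicitly public and abstract, so the dropped public modifiers were redundant; the remaining abstract keyword could be removed for the same reason:

    // equivalent declarations inside an interface:
    abstract public void stop() throws Exception;
    abstract void stop() throws Exception;
    void stop() throws Exception;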

@ -25,7 +25,6 @@
package net.yacy.peers;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@ -164,7 +163,7 @@ public class Transmission {
final ReferenceContainer<WordReference> c = (remaining >= container.size()) ? container : trimContainer(container, remaining);
// iterate through the entries in the container and check if the reference is in the repository
final List<byte[]> notFoundx = new ArrayList<byte[]>();
Collection<String> testids = new HashSet<String>();
Set<String> testids = new HashSet<String>();
Iterator<WordReference> i = c.entries();
while (i.hasNext()) {
final WordReference e = i.next();

@ -129,7 +129,7 @@ public class ResourceObserver {
if(MemoryControl.properState()) return Space.HIGH;
// clear some caches - @all: are there more of these that we could clear here?
this.sb.index.clearCache();
this.sb.index.clearCaches();
SearchEventCache.cleanupEvents(true);
this.sb.trail.clear();
Switchboard.urlBlacklist.clearblacklistCache();

@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch {
* @param ids a collection of url hashes
* @return a map from the hash id to: if it exists, the name of the database, otherwise null
*/
public Map<String, HarvestProcess> urlExists(final Collection<String> ids) {
public Map<String, HarvestProcess> urlExists(final Set<String> ids) {
Set<String> e = this.index.exists(ids);
Map<String, HarvestProcess> m = new HashMap<String, HarvestProcess>();
for (String id: ids) {
@ -2031,7 +2031,7 @@ public final class Switchboard extends serverSwitch {
// clear caches if necessary
if ( !MemoryControl.request(128000000L, false) ) {
this.index.clearCache();
this.index.clearCaches();
SearchEventCache.cleanupEvents(false);
this.trail.clear();
GuiHandler.clear();
@ -2556,12 +2556,16 @@ public final class Switchboard extends serverSwitch {
) {
// get the hyperlinks
final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
if (loadImages) hl.putAll(Document.getImagelinks(documents));
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
}
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
if (!loadImages) hl.putAll(Document.getImagelinks(documents));
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
}
hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents));
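
Editor's note: the removed crawler.load.image switch gives way to a per-link decision. Image links whose file extension has a registered parser (supportsExtension(...) returning null appears to mean "no error, parseable") are stacked as regular documents; the remaining image links are added only when directDocByURL is enabled and end up in the NOLOAD queue as pure links. A reduced sketch of the split, merging the two loops for readability:

    for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
        if (TextParser.supportsExtension(entry.getKey()) == null) {
            hl.put(entry.getKey(), entry.getValue());   // parseable image: crawl as document
        } else if (response.profile().directDocByURL()) {
            hl.put(entry.getKey(), entry.getValue());   // link-only: re-sorted to NOLOAD
        }
    }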
@ -2905,7 +2909,7 @@ public final class Switchboard extends serverSwitch {
// stacking may fail because of double occurrences of that url. Therefore
// we must wait here until the url has actually disappeared
int t = 100;
Collection<String> ids = new ArrayList<String>(1); ids.add(ASCII.String(urlhash));
Set<String> ids = new HashSet<String>(1); ids.add(ASCII.String(urlhash));
while (t-- > 0 && this.index.exists(ids).size() > 0) {
try {Thread.sleep(100);} catch (final InterruptedException e) {}
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);

@ -323,7 +323,6 @@ public final class SwitchboardConstants {
* <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
* <p>Name of the setting for the maximum number of crawler threads that may be active at the same time</p>
*/
public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image";
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store

@ -225,10 +225,10 @@ public final class Fulltext {
}
}
public void clearCache() {
public void clearCaches() {
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
if (this.statsDump != null) this.statsDump.clear();
this.solrInstances.clearCache();
this.solrInstances.clearCaches();
this.statsDump = null;
}
@ -250,7 +250,7 @@ public final class Fulltext {
for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear();
}
this.commit(false);
this.solrInstances.clearCache();
this.solrInstances.clearCaches();
}
}
@ -260,7 +260,7 @@ public final class Fulltext {
if (instance != null) {
for (String name: instance.getCoreNames()) new RemoteSolrConnector(instance, name).clear();
}
this.solrInstances.clearCache();
this.solrInstances.clearCaches();
}
}
@ -400,7 +400,7 @@ public final class Fulltext {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache();
if (MemoryControl.shortStatus()) clearCaches();
}
public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
@ -412,7 +412,7 @@ public final class Fulltext {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache();
if (MemoryControl.shortStatus()) clearCaches();
}
/**
@ -432,7 +432,7 @@ public final class Fulltext {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache();
if (MemoryControl.shortStatus()) clearCaches();
}
/**
@ -617,10 +617,11 @@ public final class Fulltext {
* @param ids
* @return a set of ids which exist in the database
*/
public Set<String> exists(Collection<String> ids) {
public Set<String> exists(Set<String> ids) {
HashSet<String> e = new HashSet<String>();
if (ids == null || ids.size() == 0) return e;
Collection<String> idsC = new HashSet<String>();
if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e;
Set<String> idsC = new HashSet<String>();
idsC.addAll(ids);
if (this.urlIndexFile != null) {
Iterator<String> idsi = idsC.iterator();
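
Editor's note: a minimal usage sketch of the new singleton fast path, which answers via exists(String) and, on a hit, returns the input set itself:

    Set<String> one = new HashSet<String>();
    one.add("0123456789ab");                   // hypothetical url hash
    Set<String> found = fulltext.exists(one);  // `one` on a hit, an empty set otherwise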
@ -751,12 +752,12 @@ public final class Fulltext {
}
// export methods
public Export export(final File f, final String filter, final int format, final boolean dom) {
public Export export(final File f, final String filter, final String query, final int format, final boolean dom) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread;
}
this.exportthread = new Export(f, filter, format, dom);
this.exportthread = new Export(f, filter, query, format, dom);
this.exportthread.start();
return this.exportthread;
}
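
Editor's note: a hypothetical call of the extended signature (path, filter and Solr query are made up; format 0 = text, 1 = html, 2 = rss/xml per the constructor comment; dom=false exports urls rather than hosts):

    fulltext.export(new File("/tmp/dump.rss"), ".*", "text_t:yacy", 2, false);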
@ -769,14 +770,15 @@ public final class Fulltext {
private final File f;
private final Pattern pattern;
private int count;
private String failure;
private String failure, query;
private final int format;
private final boolean dom;
private Export(final File f, final String filter, final int format, boolean dom) {
private Export(final File f, final String filter, final String query, final int format, boolean dom) {
// format: 0=text, 1=html, 2=rss/xml
this.f = f;
this.pattern = filter == null ? null : Pattern.compile(filter);
this.query = query == null ? "*:*" : query;
this.count = 0;
this.failure = null;
this.format = format;
@ -805,7 +807,7 @@ public final class Fulltext {
if (this.dom) {
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
for (final String host: stats) {
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
@ -814,21 +816,19 @@ public final class Fulltext {
this.count++;
}
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
SolrDocument doc;
ArrayList<?> title;
String url, author, hash;
String[] descriptions;
String url, hash, title, author, description;
Integer size;
Date date;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
title = (ArrayList<?>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName());
descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName());
hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
@ -836,16 +836,14 @@ public final class Fulltext {
pw.println(url);
}
if (this.format == 1) {
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
}
if (this.format == 2) {
pw.println("<item>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
if (descriptions != null && descriptions.length > 0) {
for (String d: descriptions) pw.println("<description>" + CharacterCoding.unicode2xml(d, true) + "</description>");
}
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
@ -883,6 +881,13 @@ public final class Fulltext {
public int count() {
return this.count;
}
@SuppressWarnings("unchecked")
private String getStringFrom(final Object o) {
// Solr returns multi-valued fields (title, description_txt) as an ArrayList,
// while single-valued fields arrive as a plain String
if (o == null) return "";
if (o instanceof ArrayList) {
final ArrayList<String> a = (ArrayList<String>) o;
return a.isEmpty() ? "" : a.get(0); // guard against an empty multi-value
}
return (String) o;
}
}

@ -29,7 +29,6 @@ package net.yacy.search.index;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
@ -443,7 +442,7 @@ public class Segment {
* @param ids
* @return a set of ids which exist in the database
*/
public Set<String> exists(final Collection<String> ids) {
public Set<String> exists(final Set<String> ids) {
return this.fulltext.exists(ids);
}
@ -504,10 +503,10 @@ public class Segment {
}
}
public void clearCache() {
public void clearCaches() {
if (this.urlCitationIndex != null) this.urlCitationIndex.clearCache();
if (this.termIndex != null) this.termIndex.clearCache();
this.fulltext.clearCache();
this.fulltext.clearCaches();
}
public File getLocation() {

@ -242,7 +242,8 @@ public class QueryGoal {
// add filter to prevent that results come from failed urls
q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND (");
q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *] OR ");
q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif))");
q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif) OR "); // trailing space needed before the next clause
q.append(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*))");
// parse special requests
if (isCatchall()) return q;
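
Editor's note: with the corrected spacing above, the assembled image filter should read roughly as follows (one query string, shown wrapped):

    // httpstatus_i:200 AND (images_urlstub_sxt:[* TO *] OR
    //                       url_file_ext_s:(jpg OR png OR gif) OR
    //                       content_type:(image/*))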

@ -898,17 +898,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts");
int countcheck = 0;
for (String host: hostscore.keyList(true)) {
// Patch the citation index for links with canonical tags.
// This shall fulfill the following requirement:
// If a document A links to B and B contains a 'canonical C', then the citation rank coputation shall consider that A links to C and B does not link to C.
// If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]";
long patchquerycount = collectionConnector.getCountByQuery(patchquery);
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 60000L, 50,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B;
int patchquerycountcheck = 0;
try {
while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// find all documents which link to the canonical doc
@ -926,10 +928,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CitationReference doc_A_citation = doc_A_ids_iterator.next();
segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
}
patchquerycountcheck++;
}
} catch (InterruptedException e) {
} catch (SpaceExceededException e) {
}
if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);
// do the citation rank computation
if (hostscore.get(host) <= 0) continue;
@ -939,12 +943,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
while (convergence_attempts++ < 30) {
if (crh.convergenceStep()) break;
}
ConcurrentLog.info("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps");
ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
// we now have the cr for all documents of a specific host; we store them for later use
Map<byte[], CRV> crn = crh.normalize();
//crh.log(crn);
ranking.putAll(crn); // accumulate this here for usage in document update later
countcheck++;
}
if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck);
} catch (final IOException e2) {
hostscore = new ClusteredScoreMap<String>();
}
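
Editor's note: the new countcheck counters all follow one pattern: query the expected document count up front, count what the blocking queue actually delivers, and warn on a mismatch (which can indicate concurrent index changes or a truncated queue). A reduced sketch with hypothetical names, assumed to run inside the surrounding try that already handles InterruptedException:

    long expected = connector.getCountByQuery(query);
    int counted = 0;
    SolrDocument doc;
    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
        // ... process doc ...
        counted++;
    }
    if (expected != counted) ConcurrentLog.warn("CollectionConfiguration",
            "ambiguous count: expected=" + expected + ", counted=" + counted);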
@ -952,13 +958,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// process all documents at the webgraph for the outgoing links of this document
SolrDocument doc;
if (webgraphConnector != null) {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
0, 10000000, 60000, 50);
try {
try {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
String query = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"";
long count = webgraphConnector.getCountByQuery(query);
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph");
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 60000, 50);
int countcheck = 0;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
boolean changed = false;
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
@ -978,21 +986,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
webgraphConnector.add(sid);
} catch (SolrException e) {
} catch (IOException e) {
}
}
countcheck++;
}
} catch (final InterruptedException e) {}
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + countcheck);
}
} catch (final IOException e2) {
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
} catch (final InterruptedException e3) {
ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
}
}
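
Editor's note: this hunk also widens the try to span the whole host loop, so a single IOException or InterruptedException now ends webgraph postprocessing for all remaining hosts rather than only the current one. The skeleton of the restructure, as an assumed simplification:

    try {
        for (String host : hostscore.keyList(true)) {
            // per-host: count, stream webgraph edges, update cr values, verify countcheck
        }
    } catch (final IOException e) {
        // logged; remaining hosts are skipped
    } catch (final InterruptedException e) {
        // logged; remaining hosts are skipped
    }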
// process all documents in collection
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
0, 10000, 60000, 50);
String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
try {
long count = collectionConnector.getCountByQuery(query);
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000, 60000, 50);
int countcheck = 0;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// for each to-be-processed entry work on the process tag
Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
@ -1031,8 +1047,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (!hostExtentCache.containsKey(hosthash)) {
StringBuilder q = new StringBuilder();
q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
hostExtentCache.put(hosthash, count);
long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
hostExtentCache.put(hosthash, hostExtentCount);
}
if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;
@ -1047,13 +1063,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
proccount++;
} catch (final Throwable e1) {
}
countcheck++;
}
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck);
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
proccount_clickdepthchange + " clickdepth changes, " +
proccount_referencechange + " reference-count changes, " +
proccount_uniquechange + " unique field changes, " +
proccount_citationchange + " citation ranking changes.");
} catch (final InterruptedException e) {
} catch (final InterruptedException e2) {
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
} catch (IOException e3) {
ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
}
return proccount;
}
@ -1148,8 +1169,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (entry == null || entry.getValue() == null) continue;
try {
String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName());
ConcurrentLog.info("CollectionConfiguration.CRHost", "CR for " + url);
ConcurrentLog.info("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString());
ConcurrentLog.info("CollectionConfiguration", "CR for " + url);
ConcurrentLog.info("CollectionConfiguration", ">> " + entry.getValue().toString());
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
