merge rc1/master

pull/1/head
reger 11 years ago
parent 082c9a98c1
commit 1437c45383

@@ -461,19 +461,21 @@
          and old cache.
       -->
     <filterCache class="solr.FastLRUCache"
-                 size="512"
-                 initialSize="512"
-                 autowarmCount="0"/>
+                 size="64"
+                 initialSize="64"
+                 autowarmCount="4"
+                 cleanupThread="true"/>
 
     <!-- Query Result Cache
          Caches results of searches - ordered lists of document ids
          (DocList) based on a query, a sort, and the range of documents requested.
       -->
-    <queryResultCache class="solr.LRUCache"
-                     size="512"
-                     initialSize="512"
-                     autowarmCount="0"/>
+    <queryResultCache class="solr.FastLRUCache"
+                     size="64"
+                     initialSize="64"
+                     autowarmCount="4"
+                     cleanupThread="true"/>
 
     <!-- Document Cache
@@ -481,10 +483,11 @@
          document). Since Lucene internal document ids are transient,
          this cache will not be autowarmed.
       -->
-    <documentCache class="solr.LRUCache"
-                   size="512"
-                   initialSize="512"
-                   autowarmCount="0"/>
+    <documentCache class="solr.FastLRUCache"
+                   size="64"
+                   initialSize="64"
+                   autowarmCount="4"
+                   cleanupThread="true"/>
 
     <!-- Field Value Cache
@@ -494,9 +497,10 @@
       -->
     <!--
     <fieldValueCache class="solr.FastLRUCache"
-                     size="512"
+                     size="64"
                      autowarmCount="128"
-                     showItems="32" />
+                     showItems="32"
+                     cleanupThread="true"/>
       -->
 
     <!-- Custom Cache
@@ -510,11 +514,12 @@
       -->
     <!--
     <cache name="myUserCache"
-           class="solr.LRUCache"
-           size="4096"
-           initialSize="1024"
-           autowarmCount="1024"
+           class="solr.FastLRUCache"
+           size="64"
+           initialSize="64"
+           autowarmCount="64"
            regenerator="com.mycompany.MyRegenerator"
+           cleanupThread="true"
            />
       -->

@@ -797,11 +797,6 @@ search.excludehosth=
 # the cases of nocache, iffresh and ifexist causes an index deletion
 search.verify.delete = true
-# images may be treated either as documents that are shown in search results or as objects
-# that are only visible in special search environments, like image search
-search.excludeintext.image = true
-crawler.load.image = true
 # remote search details
 remotesearch.maxcount = 10
 remotesearch.maxtime = 3000

@@ -19,7 +19,7 @@
 <dt><label for="HTCachePath">The path where the cache is stored</label></dt>
 <dd><input name="HTCachePath" id="HTCachePath" type="text" size="20" maxlength="300" value="#[HTCachePath]#" /></dd>
 <dt><label for="actualCacheSize">The current size of the cache</label></dt>
-<dd><span id="actualCacheSize">#[actualCacheSize]# MB</span></dd>
+<dd><span id="actualCacheSize">#[actualCacheSize]# MB for #[actualCacheDocCount]# files, #[docSizeAverage]# KB per file on average</span></dd>
 <dt><label for="maxCacheSize">The maximum size of the cache</label></dt>
 <dd><input name="maxCacheSize" id="maxCacheSize" type="text" size="8" maxlength="24" value="#[maxCacheSize]#" /> MB</dd>
 <dt>&nbsp;</dt>

@@ -77,7 +77,9 @@ public class ConfigHTCache_p {
         }
         prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT));
-        prop.put("actualCacheSize", (Cache.getActualCacheSize() / 1024 / 1024));
+        prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024);
+        prop.put("actualCacheDocCount", Cache.getActualCacheDocCount());
+        prop.put("docSizeAverage", Cache.getActualCacheDocCount() == 0 ? 0 : Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024); // guard against division by zero on an empty cache
         prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64));
         // return rewrite properties
         return prop;

@@ -34,7 +34,7 @@ public class ContentAnalysis_p {
         // clean up all search events
         SearchEventCache.cleanupEvents(true);
-        sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings
+        sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
 
         if (post != null && post.containsKey("EnterDoublecheck")) {
             Ranking.setMinTokenLen(post.getInt("minTokenLen", 3));

@@ -553,7 +553,6 @@ public class HostBrowser {
                         }
                     } catch (final IOException e) {
                     }
                 }
             }
             this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue();
             this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue();
@@ -562,7 +561,7 @@ public class HostBrowser {
             StringBuilder sbi = new StringBuilder();
             int c = 0;
             for (String s: references_internal_urls) {
-                sbi.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
+                sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
                 c++;
                 if (c % 80 == 0) sbi.append("<br/>");
             }
@@ -570,7 +569,7 @@ public class HostBrowser {
             StringBuilder sbe = new StringBuilder();
             c = 0;
             for (String s: references_external_urls) {
-                sbe.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
+                sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
                 c++;
                 if (c % 80 == 0) sbe.append("<br/>");
             }

@@ -193,6 +193,9 @@ function updatepage(str) {
       <dt class="TableCellDark">URL Filter</dt>
       <dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
       </dd>
+      <dt class="TableCellDark">Query</dt>
+      <dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
+      </dd>
       <dt class="TableCellDark">Export Format</dt>
       <dd>Only Domain:
       <input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;

@@ -261,7 +261,8 @@ public class IndexControlURLs_p {
             final File f = new File(s);
             f.getParentFile().mkdirs();
             final String filter = post.get("exportfilter", ".*");
-            final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);
+            final String query = post.get("exportquery", "*:*");
+            final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom);
 
             prop.put("lurlexport_exportfile", s);
             prop.put("lurlexport_urlcount", running.count());

@@ -38,7 +38,7 @@ public class RankingSolr_p {
         // clean up all search events
         SearchEventCache.cleanupEvents(true);
-        sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings
+        sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
 
         int profileNr = 0;
         if (post != null) profileNr = post.getInt("profileNr", profileNr);

@@ -360,7 +360,7 @@ public class yacysearch {
             // check available memory and clean up if necessary
             if ( !MemoryControl.request(8000000L, false) ) {
-                indexSegment.clearCache();
+                indexSegment.clearCaches();
                 SearchEventCache.cleanupEvents(false);
             }

@@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
 import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.document.parser.html.CharacterCoding;
 
 /**
  * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@@ -66,7 +67,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
 
     public static final MultiProtocolURL POISON = new MultiProtocolURL(); // poison pill for concurrent link generators
 
-    private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
     private static final long serialVersionUID = -1173233022912141884L;
     private static final long SMB_TIMEOUT = 5000;
@@ -636,7 +636,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
         } else {
             this.searchpart = this.path.substring(r + 1);
             // strip &amp;
-            Matcher matcher = ampPattern.matcher(this.searchpart);
+            Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
             while (matcher.find()) {
                 this.searchpart = matcher.replaceAll("&");
                 matcher.reset(this.searchpart);

@@ -21,7 +21,6 @@
 package net.yacy.cora.federate.solr.connector;
 
 import java.io.IOException;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -235,7 +234,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
      * @return a collection of a subset of the ids which exist in the index
      * @throws IOException
      */
-    public Set<String> existsByIds(Collection<String> ids) throws IOException {
+    public Set<String> existsByIds(Set<String> ids) throws IOException {
         if (ids == null || ids.size() == 0) return new HashSet<String>();
         // construct raw query
         final SolrQuery params = new SolrQuery();
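Note: this commit narrows existsByIds from Collection<String> to Set<String> across all connectors, so duplicate ids are impossible by construction. A minimal usage sketch; the helper name and the wrapping method are hypothetical, only the interface call is from the hunk above:

    import java.io.IOException;
    import java.util.Set;
    import net.yacy.cora.federate.solr.connector.SolrConnector;

    // filter a batch of candidate document ids down to those already indexed
    static Set<String> alreadyIndexed(SolrConnector connector, Set<String> ids) throws IOException {
        return connector.existsByIds(ids); // returns the subset of ids present in the index
    }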

@@ -61,7 +61,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
         this.missCache = new ConcurrentARC<String, Object>(missCacheMax, partitions);
     }
 
-    public void clearCache() {
+    public void clearCaches() {
         this.hitCache.clear();
         this.missCache.clear();
         this.documentCache.clear();
@@ -70,9 +70,9 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
 
     @Override
     public synchronized void close() {
+        this.clearCaches();
         if (this.solr != null) this.solr.close();
         this.solr = null;
-        this.clearCache();
     }
 
     /**
@@ -81,7 +81,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
      */
     @Override
     public void clear() throws IOException {
-        this.clearCache();
+        this.clearCaches();
         if (this.solr != null) this.solr.clear();
     }
@@ -119,7 +119,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
 
     @Override
     public void deleteByQuery(final String querystring) throws IOException {
-        this.clearCache();
+        this.clearCaches();
         this.solr.deleteByQuery(querystring);
     }
@@ -261,7 +261,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
     }
 
     private void addToCache(SolrDocumentList list, boolean doccache) {
-        if (MemoryControl.shortStatus()) clearCache();
+        if (MemoryControl.shortStatus()) clearCaches();
         for (final SolrDocument solrdoc: list) {
             addToCache(solrdoc, doccache);
         }

@@ -118,6 +118,12 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
         ensureAliveUpdateHandler();
     }
 
+    @Override
+    public void clearCaches() {
+        this.connector.clearCaches();
+        this.idCache.clear();
+    }
+
     /**
      * used for debugging
      */
@@ -326,10 +332,11 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
     }
 
     @Override
-    public Set<String> existsByIds(Collection<String> ids) throws IOException {
+    public Set<String> existsByIds(Set<String> ids) throws IOException {
         HashSet<String> e = new HashSet<String>();
         if (ids == null || ids.size() == 0) return e;
-        Collection<String> idsC = new HashSet<String>();
+        if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : e;
+        Set<String> idsC = new HashSet<String>();
         for (String id: ids) {
             if (this.idCache.has(ASCII.getBytes(id))) {cacheSuccessSign(); e.add(id); continue;}
             if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); continue;}
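Note: the loop above layers two local membership tests before any Solr round trip. The same loop, annotated as a sketch (names are from the hunk; the bulk lookup on idsC happens after the cutoff):

    for (String id : ids) {
        if (this.idCache.has(ASCII.getBytes(id))) { e.add(id); continue; } // 1. cache says it exists: no Solr call
        if (existIdFromDeleteQueue(id)) continue;                          // 2. pending delete: treat as missing
        idsC.add(id);                                                      // 3. unresolved: ask Solr once, in bulk
    }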

@@ -22,7 +22,6 @@
 package net.yacy.cora.federate.solr.connector;
 
 import java.io.IOException;
-import java.util.Collection;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.concurrent.BlockingQueue;
@@ -35,6 +34,7 @@ import net.yacy.search.schema.CollectionSchema;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.search.Query;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.response.QueryResponse;
@@ -48,10 +48,14 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.handler.component.SearchHandler;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequestBase;
+import org.apache.solr.request.UnInvertedField;
 import org.apache.solr.response.ResultContext;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.search.DocIterator;
 import org.apache.solr.search.DocList;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.QueryResultKey;
+import org.apache.solr.search.SolrCache;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.RefCounted;
@@ -89,6 +93,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
         super.init(this.instance.getServer(coreName));
     }
 
+    public void clearCaches() {
+        SolrConfig solrConfig = this.core.getSolrConfig();
+        @SuppressWarnings("unchecked")
+        SolrCache<String, UnInvertedField> fieldValueCache = solrConfig.fieldValueCacheConfig == null ? null : solrConfig.fieldValueCacheConfig.newInstance();
+        if (fieldValueCache != null) fieldValueCache.clear();
+        @SuppressWarnings("unchecked")
+        SolrCache<Query, DocSet> filterCache = solrConfig.filterCacheConfig == null ? null : solrConfig.filterCacheConfig.newInstance();
+        if (filterCache != null) filterCache.clear();
+        @SuppressWarnings("unchecked")
+        SolrCache<QueryResultKey, DocList> queryResultCache = solrConfig.queryResultCacheConfig == null ? null : solrConfig.queryResultCacheConfig.newInstance();
+        if (queryResultCache != null) queryResultCache.clear();
+        @SuppressWarnings("unchecked")
+        SolrCache<Integer, Document> documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance();
+        if (documentCache != null) documentCache.clear();
+    }
+
     public SolrInstance getInstance() {
         return this.instance;
     }
@@ -224,9 +244,9 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
     }
 
     @Override
-    public Set<String> existsByIds(Collection<String> ids) {
+    public Set<String> existsByIds(Set<String> ids) {
         if (ids == null || ids.size() == 0) return new HashSet<String>();
-        if (ids.size() == 1 && ids instanceof Set) return existsById(ids.iterator().next()) ? (Set<String>) ids : new HashSet<String>();
+        if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
         StringBuilder sb = new StringBuilder(); // construct something like "({!raw f=id}Ij7B63g-gSHA) OR ({!raw f=id}PBcGI3g-gSHA)"
         for (String id: ids) {
             sb.append("({!raw f=").append(CollectionSchema.id.getSolrFieldName()).append('}').append(id).append(") OR ");

@@ -53,6 +53,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
         this.solr0 = solr0;
         this.solr1 = solr1;
     }
 
+    @Override
+    public void clearCaches() {
+        if (this.solr0 != null) this.solr0.clearCaches();
+        if (this.solr1 != null) this.solr1.clearCaches();
+    }
+
     public boolean isConnected0() {
         return this.solr0 != null;
@@ -347,7 +353,9 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
     }
 
     @Override
-    public Set<String> existsByIds(Collection<String> ids) throws IOException {
+    public Set<String> existsByIds(Set<String> ids) throws IOException {
+        if (ids == null || ids.size() == 0) return new HashSet<String>();
+        if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
         if (this.solr0 != null && this.solr1 == null) return this.solr0.existsByIds(ids);
         if (this.solr0 == null && this.solr1 != null) return this.solr1.existsByIds(ids);
         Set<String> s = new HashSet<String>();

@@ -71,6 +71,11 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn
         super.close();
     }
 
+    @Override
+    public void clearCaches() {
+        // we do not have direct access to the caches here, thus we simply do nothing.
+    }
+
     @Override
     public QueryResponse getResponseByParams(ModifiableSolrParams params) throws IOException {
         // during the solr query we set the thread name to the query string to get more debugging info in thread dumps
@@ -134,4 +139,5 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn
         }
         System.exit(0);
     }
 }

@@ -36,7 +36,12 @@ import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.ModifiableSolrParams;
 
 public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
 
+    /**
+     * clear all caches: inside solr and outside solr within the implementations of this interface
+     */
+    public void clearCaches();
+
     /**
      * get the size of the index
      * @return number of results if solr is queried with a catch-all pattern
@@ -106,7 +111,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
      * @return a collection of a subset of the ids which exist in the index
      * @throws IOException
      */
-    public Set<String> existsByIds(Collection<String> ids) throws IOException;
+    public Set<String> existsByIds(Set<String> ids) throws IOException;
 
     /**
     * check if a given document exists in solr
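Note: with clearCaches() on the interface, callers no longer need instanceof checks (compare the InstanceMirror hunk below). A sketch of the memory-pressure pattern from the yacysearch hunk above; the wrapping method is hypothetical, MemoryControl.request and the 8 MB threshold are taken from that hunk:

    static void relieveMemory(SolrConnector connector) {
        if (!MemoryControl.request(8000000L, false)) {
            connector.clearCaches(); // safe on every implementation; RemoteSolrConnector's is a no-op
        }
    }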

@@ -64,7 +64,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
     public SolrServer getServer() {
         return this.server;
     }
 
     @Override
     public void commit(final boolean softCommit) {
         synchronized (this.server) {

@@ -24,7 +24,6 @@ import java.util.Collection;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
-import net.yacy.cora.federate.solr.connector.CachedSolrConnector;
 import net.yacy.cora.federate.solr.connector.ConcurrentUpdateSolrConnector;
 import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
 import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
@@ -161,9 +160,9 @@ public class InstanceMirror {
         return msc;
     }
 
-    public void clearCache() {
+    public void clearCaches() {
         for (SolrConnector csc: this.connectorCache.values()) {
-            if (csc instanceof CachedSolrConnector) ((CachedSolrConnector) csc).clearCache();
+            csc.clearCaches();
         }
         for (EmbeddedSolrConnector ssc: this.embeddedCache.values()) ssc.commit(true);
     }

@@ -1,195 +1,193 @@
 /**
  * HTMLResponseWriter
  * Copyright 2013 by Michael Peter Christen
  * First released 09.06.2013 at http://yacy.net
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
 package net.yacy.cora.federate.solr.responsewriter;
 
 import java.io.IOException;
 import java.io.Writer;
 import java.util.Date;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.search.schema.CollectionSchema;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexableField;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.XML;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.QueryResponseWriter;
 import org.apache.solr.response.ResultContext;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.schema.TextField;
 import org.apache.solr.search.DocIterator;
 import org.apache.solr.search.DocList;
 import org.apache.solr.search.SolrIndexSearcher;
 
 public class HTMLResponseWriter implements QueryResponseWriter {
 
     private static final Set<String> DEFAULT_FIELD_LIST = null;
     private static final Pattern dqp = Pattern.compile("\"");
 
     public HTMLResponseWriter() {
         super();
     }
 
     @Override
     public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
         return "text/html";
     }
 
     @Override
     public void init(@SuppressWarnings("rawtypes") NamedList n) {
     }
 
     @Override
     public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
         NamedList<?> values = rsp.getValues();
         assert values.get("responseHeader") != null;
         assert values.get("response") != null;
 
         writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
         //writer.write("<!--\n");
         //writer.write("this is a XHTML+RDFa file. It contains RDF annotations with dublin core properties\n");
         //writer.write("you can validate it with http://validator.w3.org/\n");
         //writer.write("-->\n");
         writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\"\n");
         writer.write("      xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n");
         writer.write("      xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n");
         writer.write("      xmlns:foaf=\"http://xmlns.com/foaf/0.1/\">\n");
         writer.write("<head profile=\"http://www.w3.org/2003/g/data-view\">\n");
         //writer.write("<link rel=\"transformation\" href=\"http://www-sop.inria.fr/acacia/soft/RDFa2RDFXML.xsl\"/>\n");
         writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
         writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
 
         NamedList<Object> paramsList = request.getOriginalParams().toNamedList();
         paramsList.remove("wt");
         String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
-        writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
-        writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</div>\n");
 
         DocList response = ((ResultContext) values.get("response")).docs;
         final int sz = response.size();
         if (sz > 0) {
             SolrIndexSearcher searcher = request.getSearcher();
             DocIterator iterator = response.iterator();
             IndexSchema schema = request.getSchema();
 
             int id = iterator.nextDoc();
             Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
             LinkedHashMap<String, String> tdoc = translateDoc(schema, doc);
 
             String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
             if (sz == 1) {
                 writer.write("<title>" + title + "</title>\n</head><body>\n");
             } else {
                 writer.write("<title>Document List</title>\n</head><body>\n");
             }
             writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
             writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see this page as XML.</span></div>\n");
 
             writeDoc(writer, tdoc, title);
 
             while (iterator.hasNext()) {
                 id = iterator.nextDoc();
                 doc = searcher.doc(id, DEFAULT_FIELD_LIST);
                 tdoc = translateDoc(schema, doc);
                 title = tdoc.get(CollectionSchema.title.getSolrFieldName());
                 writeDoc(writer, tdoc, title);
             }
         } else {
             writer.write("<title>No Document Found</title>\n</head><body>\n");
         }
 
         writer.write("</body></html>\n");
     }
 
     private static final void writeDoc(Writer writer, LinkedHashMap<String, String> tdoc, String title) throws IOException {
         writer.write("<form name=\"yacydoc" + title + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
         writer.write("<fieldset>\n");
         writer.write("<h1 property=\"dc:Title\">" + title + "</h1>\n");
         writer.write("<dl>\n");
         for (Map.Entry<String, String> entry: tdoc.entrySet()) {
             writer.write("<dt>");
             writer.write(entry.getKey());
             writer.write("</dt><dd>");
             XML.escapeAttributeValue(entry.getValue(), writer);
             writer.write("</dd>\n");
         }
         writer.write("</dl>\n");
         writer.write("</fieldset>\n");
         writer.write("</form>\n");
     }
 
     static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
         List<IndexableField> fields = doc.getFields();
         int sz = fields.size();
         int fidx1 = 0, fidx2 = 0;
         LinkedHashMap<String, String> kv = new LinkedHashMap<String, String>();
         while (fidx1 < sz) {
             IndexableField value = fields.get(fidx1);
             String fieldName = value.name();
             fidx2 = fidx1 + 1;
             while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) {
                 fidx2++;
             }
             SchemaField sf = schema.getFieldOrNull(fieldName);
             if (sf == null) sf = new SchemaField(fieldName, new TextField());
             FieldType type = sf.getType();
 
             if (fidx1 + 1 == fidx2) {
                 if (sf.multiValued()) {
                     String sv = value.stringValue();
                     kv.put(fieldName, field2string(type, sv));
                 } else {
                     kv.put(fieldName, field2string(type, value.stringValue()));
                 }
             } else {
                 for (int i = fidx1; i < fidx2; i++) {
                     String sv = fields.get(i).stringValue();
                     kv.put(fieldName + "_" + i, field2string(type, sv));
                 }
             }
 
             fidx1 = fidx2;
         }
         return kv;
     }
 
     @SuppressWarnings("deprecation")
     private static String field2string(final FieldType type, final String value) {
         String typeName = type.getTypeName();
         if (typeName.equals(SolrType.bool.printName())) {
             return "F".equals(value) ? "false" : "true";
         } else if (typeName.equals(SolrType.date.printName())) {
             return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here
         }
         return value;
     }
 
     // XML.escapeCharData(val, writer);
 }

@@ -55,6 +55,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SMBLoader;
 import net.yacy.crawler.robots.RobotsTxt;
+import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.rwi.IndexCell;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
@@ -347,17 +348,10 @@ public final class CrawlStacker {
 
         // check availability of parser and maxfilesize
         String warning = null;
-        boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
-        if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
-            // dammit semicolon
-            // TODO: remove this shit later
-            Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
-            loadImages = true;
-        }
         ContentDomain contentDomain = entry.url().getContentDomainFromExt();
         if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
             contentDomain == ContentDomain.APP ||
-            (!loadImages && contentDomain == ContentDomain.IMAGE) ||
+            (contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
             contentDomain == ContentDomain.AUDIO ||
             contentDomain == ContentDomain.VIDEO ||
             contentDomain == ContentDomain.CTRL) {
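Note: the rewritten condition drops the crawler.load.image switch (also removed from yacy.init above) in favour of parser availability. Assuming TextParser.supportsExtension() follows the convention of returning null when a parser exists and an error string otherwise, the new rule reads:

    // reject an image resource only when no parser can handle its file extension
    boolean rejectImage = contentDomain == ContentDomain.IMAGE
            && TextParser.supportsExtension(entry.url()) != null;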

@@ -182,6 +182,14 @@ public final class Cache {
     public static long getActualCacheSize() {
         return fileDBunbuffered.length();
     }
 
+    /**
+     * get the number of documents currently stored in the cache
+     * @return the document count
+     */
+    public static long getActualCacheDocCount() {
+        return fileDBunbuffered.size();
+    }
+
     /**
      * close the databases
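Note: getActualCacheDocCount() feeds the per-file average shown in ConfigHTCache_p above; since a fresh cache holds zero documents, the division needs a guard. A minimal sketch of the zero-safe arithmetic:

    long docs = Cache.getActualCacheDocCount();
    long avgKB = docs == 0 ? 0 : Cache.getActualCacheSize() / docs / 1024; // KB per cached file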

@@ -41,7 +41,10 @@ import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.order.NaturalOrder;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.kelondro.blob.MapHeap;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.RowHandleSet;
 
 public class BookmarksDB {
@@ -147,11 +150,6 @@ public class BookmarksDB {
             ConcurrentLog.logException(e);
         }
     }
 
-    public String addBookmark(final Bookmark bookmark){
-        saveBookmark(bookmark);
-        return bookmark.getUrlHash();
-    }
-
     public Bookmark getBookmark(final String urlHash) throws IOException {
         try {
@@ -214,18 +212,13 @@
         final TreeSet<String> set=new TreeSet<String>(new bookmarkComparator(true));
         final String tagHash=BookmarkHelper.tagHash(tagName);
         final Tag tag=getTag(tagHash);
-        Set<String> hashes=new HashSet<String>();
-        if (tag != null) {
-            hashes=getTag(tagHash).getUrlHashes();
-        }
+        RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes();
         if (priv) {
-            set.addAll(hashes);
+            for (byte[] hash: hashes) set.add(ASCII.String(hash));
         } else {
-            final Iterator<String> it=hashes.iterator();
-            Bookmark bm;
-            while(it.hasNext()){
+            for (byte[] hash: hashes) {
                 try {
-                    bm = getBookmark(it.next());
+                    Bookmark bm = getBookmark(ASCII.String(hash));
                     if (bm != null && bm.getPublic()) {
                         set.add(bm.getUrlHash());
                     }
@@ -249,7 +242,7 @@
      * retrieve an object of type Tag from the tagCache, if object is not cached return loadTag(hash)
      * @param hash an object of type String, containing a tagHash
      */
-    public Tag getTag(final String hash){
+    private Tag getTag(final String hash){
         return this.tags.get(hash); //null if it does not exist
     }
 
@@ -257,7 +250,7 @@
      * store a Tag in tagsTable or remove an empty tag
      * @param tag an object of type Tag to be stored/removed
      */
-    public void putTag(final Tag tag){
+    private void putTag(final Tag tag){
         if (tag == null) return;
         if (tag.isEmpty()) {
             this.tags.remove(tag.getTagHash());
@@ -266,7 +259,7 @@
         }
     }
 
-    public void removeTag(final String hash) {
+    private void removeTag(final String hash) {
         this.tags.remove(hash);
     }
 
@@ -301,7 +294,7 @@
         return set.iterator();
     }
 
-    public Iterator<Tag> getTagIterator(final String tagName, final boolean priv, final int comp) {
+    private Iterator<Tag> getTagIterator(final String tagName, final boolean priv, final int comp) {
         final TreeSet<Tag> set=new TreeSet<Tag>((comp == SORT_SIZE) ? tagSizeComparator : tagComparator);
         Iterator<String> it=null;
         final Iterator<String> bit=getBookmarksIterator(tagName, priv);
@@ -347,14 +340,14 @@
         final Tag oldTag=getTag(BookmarkHelper.tagHash(oldName));
         if (oldTag != null) {
-            final Set<String> urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag
+            final RowHandleSet urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag
             removeTag(BookmarkHelper.tagHash(oldName)); // remove oldHash from TagsDB
             Bookmark bookmark;
             Set<String> tagSet = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
-            for (final String urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName
+            for (final byte[] urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName
                 try {
-                    bookmark = getBookmark(urlHash);
+                    bookmark = getBookmark(ASCII.String(urlHash));
                     tagSet = bookmark.getTags();
                     tagSet.remove(oldName);
                     bookmark.setTags(tagSet, true); // might not be needed, but doesn't hurt
@@ -371,9 +364,9 @@
 
     public void addTag(final String selectTag, final String newTag) {
         Bookmark bookmark;
-        for (final String urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag
+        for (final byte[] urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag
             try {
-                bookmark = getBookmark(urlHash);
+                bookmark = getBookmark(ASCII.String(urlHash));
                 bookmark.addTag(newTag);
                 saveBookmark(bookmark);
             } catch (final IOException e) {
@@ -389,51 +382,24 @@
      * Subclass of bookmarksDB, which provides the Tag object-type
      */
     public class Tag {
-        public static final String URL_HASHES = "urlHashes";
-        public static final String TAG_NAME = "tagName";
         private final String tagHash;
-        private final Map<String, String> mem;
-        private Set<String> urlHashes;
-
-        public Tag(final String hash, final Map<String, String> map){
-            this.tagHash = hash;
-            this.mem = map;
-            if (this.mem.containsKey(URL_HASHES)) {
-                this.urlHashes = ListManager.string2set(this.mem.get(URL_HASHES));
-            } else {
-                this.urlHashes = new HashSet<String>();
-            }
-        }
+        private final String tagName;
+        private RowHandleSet urlHashes;
 
-        public Tag(final String name, final HashSet<String> entries){
+        private Tag(final String name) {
             this.tagHash = BookmarkHelper.tagHash(name);
-            this.mem = new HashMap<String, String>();
-            //mem.put(URL_HASHES, listManager.arraylist2string(entries));
-            this.urlHashes = entries;
-            this.mem.put(TAG_NAME, name);
-        }
-
-        public Tag(final String name){
-            this(name, new HashSet<String>());
-        }
-
-        public Map<String, String> getMap(){
-            this.mem.put(URL_HASHES, ListManager.collection2string(this.urlHashes));
-            return this.mem;
+            this.tagName = name;
+            this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
         }
 
         /**
          * get the lowercase Tagname
         */
         public String getTagName(){
-            /*if(this.mem.containsKey(TAG_NAME)){
-                return (String) this.mem.get(TAG_NAME);
-            }
-            return "";*/
             return getFriendlyName().toLowerCase();
         }
 
-        public String getTagHash(){
+        private String getTagHash(){
             return this.tagHash;
         }
@@ -441,37 +407,33 @@
          * @return the tag name, with all uppercase chars
          */
         public String getFriendlyName(){
-            /*if(this.mem.containsKey(TAG_FRIENDLY_NAME)){
-                return (String) this.mem.get(TAG_FRIENDLY_NAME);
-            }
-            return getTagName();*/
-            if(this.mem.containsKey(TAG_NAME)){
-                return this.mem.get(TAG_NAME);
-            }
-            return "notagname";
+            return this.tagName;
         }
 
-        public Set<String> getUrlHashes(){
+        private RowHandleSet getUrlHashes(){
             return this.urlHashes;
         }
 
-        public boolean hasPublicItems(){
+        private boolean hasPublicItems(){
             return getBookmarksIterator(getTagName(), false).hasNext();
         }
 
-        public void addUrl(final String urlHash){
-            this.urlHashes.add(urlHash);
+        private void addUrl(final String urlHash){
+            try {
+                this.urlHashes.put(ASCII.getBytes(urlHash));
+            } catch (SpaceExceededException e) {
+            }
         }
 
-        public void delete(final String urlHash){
-            this.urlHashes.remove(urlHash);
+        private void delete(final String urlHash){
+            this.urlHashes.remove(ASCII.getBytes(urlHash));
         }
 
         public int size(){
             return this.urlHashes.size();
         }
 
-        public boolean isEmpty() {
+        private boolean isEmpty() {
             return this.urlHashes.isEmpty();
         }
     }
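Note: the Tag class now keeps url hashes as fixed-length byte[] keys in a RowHandleSet instead of a Set<String> backed by a string-serialized map, avoiding per-entry String objects. A sketch using only the RowHandleSet operations visible in this diff; the example hash value is hypothetical:

    RowHandleSet hashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
    try {
        hashes.put(ASCII.getBytes("AAAAAAAAAAAA")); // a 12-byte url hash, hypothetical value
    } catch (SpaceExceededException e) {
    }
    for (byte[] hash : hashes) System.out.println(ASCII.String(hash)); // iteration yields byte[]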
@@ -481,27 +443,19 @@
      */
     public class Bookmark {
 
-        public static final String BOOKMARK_URL = "bookmarkUrl";
+        private static final String BOOKMARK_URL = "bookmarkUrl";
         public static final String BOOKMARK_TITLE = "bookmarkTitle";
         public static final String BOOKMARK_DESCRIPTION = "bookmarkDesc";
-        public static final String BOOKMARK_TAGS = "bookmarkTags";
-        public static final String BOOKMARK_PUBLIC = "bookmarkPublic";
-        public static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp";
-        public static final String BOOKMARK_OWNER = "bookmarkOwner";
-        public static final String BOOKMARK_IS_FEED = "bookmarkIsFeed";
+        private static final String BOOKMARK_TAGS = "bookmarkTags";
+        private static final String BOOKMARK_PUBLIC = "bookmarkPublic";
+        private static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp";
+        private static final String BOOKMARK_OWNER = "bookmarkOwner";
+        private static final String BOOKMARK_IS_FEED = "bookmarkIsFeed";
 
         private final String urlHash;
         private Set<String> tagNames;
         private long timestamp;
         private final Map<String, String> entry;
 
-        public Bookmark(final String urlHash, final Map<String, String> map) {
-            this.entry = map;
-            this.urlHash = urlHash;
-            this.tagNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
-            if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS)));
-            loadTimestamp();
-        }
-
         public Bookmark(final DigestURL url) {
             this.entry = new HashMap<String, String>();
             this.urlHash = ASCII.String(url.hash());
@@ -529,11 +483,15 @@
             this(new DigestURL((url.indexOf("://") < 0) ? "http://" + url : url));
         }
 
-        public Bookmark(final Map<String, String> map) throws MalformedURLException {
-            this(ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()), map);
+        private Bookmark(final Map<String, String> map) throws MalformedURLException {
+            this.entry = map;
+            this.urlHash = ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash());
+            this.tagNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
+            if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS)));
+            loadTimestamp();
         }
 
-        Map<String, String> toMap() {
+        private Map<String, String> toMap() {
             this.entry.put(BOOKMARK_TAGS, ListManager.collection2string(this.tagNames));
             this.entry.put(BOOKMARK_TIMESTAMP, String.valueOf(this.timestamp));
             return this.entry;
@@ -688,11 +646,11 @@
     /**
      * Subclass of bookmarksDB, which provides the bookmarkIterator object-type
      */
-    public class bookmarkIterator implements Iterator<Bookmark> {
+    private class bookmarkIterator implements Iterator<Bookmark> {
 
         Iterator<byte[]> bookmarkIter;
 
-        public bookmarkIterator(final boolean up) throws IOException {
+        private bookmarkIterator(final boolean up) throws IOException {
             //flushBookmarkCache(); //XXX: this will cost performance
             this.bookmarkIter = BookmarksDB.this.bookmarks.keys(up, false);
             //this.nextEntry = null;
@@ -722,14 +680,14 @@
     /**
      * Comparator to sort objects of type Bookmark according to their timestamps
      */
-    public class bookmarkComparator implements Comparator<String> {
+    private class bookmarkComparator implements Comparator<String> {
 
         private final boolean newestFirst;
 
         /**
          * @param newestFirst newest first, or oldest first?
         */
-        public bookmarkComparator(final boolean newestFirst){
+        private bookmarkComparator(final boolean newestFirst){
             this.newestFirst = newestFirst;
         }
 
@@ -752,13 +710,13 @@
         }
     }
 
-    public static final TagComparator tagComparator = new TagComparator();
-    public static final TagSizeComparator tagSizeComparator = new TagSizeComparator();
+    private static final TagComparator tagComparator = new TagComparator();
+    private static final TagSizeComparator tagSizeComparator = new TagSizeComparator();
 
     /**
      * Comparator to sort objects of type Tag according to their names
      */
-    public static class TagComparator implements Comparator<Tag>, Serializable {
+    private static class TagComparator implements Comparator<Tag>, Serializable {
 
         /**
          * generated serial
@@ -772,7 +730,7 @@
     }
 
-    public static class TagSizeComparator implements Comparator<Tag>, Serializable {
+    private static class TagSizeComparator implements Comparator<Tag>, Serializable {
 
         /**
         * generated serial

@@ -26,12 +26,15 @@ package net.yacy.document.parser.html;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.regex.Pattern;
 /**
  * Contains methods to convert between Unicode and XML/HTML encoding.
  */
 public final class CharacterCoding {
+    /** Ampersand pattern */
+    public final static Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
     /** Ampersand character in unicode encoding. */
     private static final char AMP_UNICODE = "\u0026".charAt(0);
     /** Ampersand character in HTML encoding. */
@@ -276,14 +279,15 @@ public final class CharacterCoding
         }
         return sb.toString();
     }
     /**
      * Replaces HTML-encoded characters with unicode representation.
      * @param text text with character to replace
      * @return text with replaced characters
      */
-    public static String html2unicode(final String text) {
+    public static String html2unicode(String text) {
         if (text == null) return null;
+        text = ampPattern.matcher(text).replaceAll("&"); // sometimes a double-replacement is necessary.
         int p = 0, p1, q;
         final StringBuilder sb = new StringBuilder(text.length());
         String s;
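
The pre-pass over "&amp;" exists because scraped text is often double-encoded: an entity like &auml; arrives as &amp;auml;, so the ampersand itself must be decoded before the entity table is consulted. Below is a minimal, self-contained sketch of that two-stage order; the three-entry entity map is an illustrative stand-in for CharacterCoding's full entity table, not YaCy's API.

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

// Minimal sketch of the decoding order introduced above: first undo the
// double-encoded ampersand, then resolve the remaining entities. The tiny
// entity map is an illustration only; the real CharacterCoding class
// carries a full HTML entity table.
public class DoubleDecodeSketch {

    private static final Pattern AMP = Pattern.compile(Pattern.quote("&amp;"));
    private static final Map<String, String> ENTITIES = new HashMap<String, String>();
    static {
        ENTITIES.put("&uuml;", "\u00fc");
        ENTITIES.put("&lt;", "<");
        ENTITIES.put("&gt;", ">");
    }

    public static String html2unicode(String text) {
        if (text == null) return null;
        // "M&amp;uuml;nchen" becomes "M&uuml;nchen" before the entity lookup
        text = AMP.matcher(text).replaceAll("&");
        for (Map.Entry<String, String> e : ENTITIES.entrySet()) {
            text = text.replace(e.getKey(), e.getValue());
        }
        return text;
    }

    public static void main(String[] args) {
        System.out.println(html2unicode("M&amp;uuml;nchen &lt;Stadt&gt;")); // München <Stadt>
    }
}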

@@ -204,11 +204,12 @@ public class ContentScraper extends AbstractScraper implements Scraper
     }
     @Override
-    public void scrapeText(final char[] newtext, final String insideTag) {
+    public void scrapeText(final char[] newtext0, final String insideTag) {
         // System.out.println("SCRAPE: " + UTF8.String(newtext));
         if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
         int p, pl, q, s = 0;
+        char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
         // match evaluation pattern
         this.evaluationScores.match(Element.text, newtext);
@@ -466,7 +467,8 @@ public class ContentScraper extends AbstractScraper implements Scraper
     public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
         // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
         if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
-            final String href = tagopts.getProperty("href", EMPTY_STRING);
+            String href = tagopts.getProperty("href", EMPTY_STRING);
+            href = CharacterCoding.html2unicode(href);
             AnchorURL url;
             if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
                 final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
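
Decoding the href attribute before resolution matters because in conforming HTML a query-string ampersand is written as &amp;, so the raw attribute value is not yet a usable URL. A toy illustration, with a plain replace standing in for the full CharacterCoding.html2unicode call:

// Toy illustration of why hrefs are decoded before URL resolution; the plain
// replace stands in for the full CharacterCoding.html2unicode call above.
public class HrefDecodeSketch {
    public static void main(String[] args) {
        String rawHref = "yacysearch.html?query=test&amp;startRecord=10";
        String href = rawHref.replace("&amp;", "&");
        System.out.println(href); // yacysearch.html?query=test&startRecord=10
    }
}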

@@ -32,27 +32,15 @@ import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.lang.reflect.Method;
 import java.util.Date;
-import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.exceptions.CryptographyException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
-import org.apache.pdfbox.pdmodel.font.PDCIDFont;
-import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font;
-import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDMMType1Font;
-import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
-import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
-import org.apache.pdfbox.pdmodel.font.PDType0Font;
-import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont;
-import org.apache.pdfbox.pdmodel.font.PDType1CFont;
-import org.apache.pdfbox.pdmodel.font.PDType1Font;
-import org.apache.pdfbox.pdmodel.font.PDType3Font;
 import org.apache.pdfbox.util.PDFTextStripper;
 import net.yacy.cora.document.id.AnchorURL;
@@ -222,25 +210,54 @@ public class pdfParser extends AbstractParser implements Parser
             false,
             docDate)};
     }
-    @SuppressWarnings("static-access")
     public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
         // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
         // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
         // Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
-        PDFont.clearResources();
-        COSName.clearResources();
-        PDType1Font.clearResources();
-        PDTrueTypeFont.clearResources();
-        PDType0Font.clearResources();
-        PDType1AfmPfbFont.clearResources();
-        PDType3Font.clearResources();
-        PDType1CFont.clearResources();
-        PDCIDFont.clearResources();
-        PDCIDFontType0Font.clearResources();
-        PDCIDFontType2Font.clearResources();
-        PDMMType1Font.clearResources();
-        PDSimpleFont.clearResources();
+        ResourceCleaner cl = new ResourceCleaner();
+        cl.clearClassResources("org.apache.pdfbox.cos.COSName");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font");
+        cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont");
+    }
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    private static class ResourceCleaner {
+        Method findLoadedClass;
+        private ClassLoader sys;
+        public ResourceCleaner() {
+            try {
+                this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class });
+                this.findLoadedClass.setAccessible(true);
+                this.sys = ClassLoader.getSystemClassLoader();
+            } catch (Throwable e) {
+                e.printStackTrace();
+                this.findLoadedClass = null;
+                this.sys = null;
+            }
+        }
+        public void clearClassResources(String name) {
+            if (this.findLoadedClass == null) return;
+            try {
+                Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name);
+                if (pdfparserpainclass != null) {
+                    Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {});
+                    if (clearResources != null) clearResources.invoke(null);
+                }
+            } catch (Throwable e) {
+                e.printStackTrace();
+            }
        }
     }
 /**
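
The rewritten cleanup drops the compile-time dependency on the individual PDFBox font classes: instead of importing each class to call its static clearResources(), the ResourceCleaner looks classes up by name via ClassLoader.findLoadedClass and only invokes the method on classes that were actually loaded. A hedged, generic sketch of the same pattern follows; note that forcing findLoadedClass accessible works on the JVMs this commit targeted but is refused by recent JDKs with stronger encapsulation.

import java.lang.reflect.Method;

// Generic sketch of the reflective pattern above: invoke a static no-arg
// method on a class only if it has already been loaded, so the caller needs
// no compile-time dependency on that class. Nothing happens when the class
// was never loaded.
public class LoadedClassInvoker {

    public static void invokeIfLoaded(String className, String methodName) {
        try {
            Method findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", String.class);
            findLoadedClass.setAccessible(true); // blocked on recent JDKs
            Object clazz = findLoadedClass.invoke(ClassLoader.getSystemClassLoader(), className);
            if (clazz != null) {
                Method m = ((Class<?>) clazz).getDeclaredMethod(methodName);
                m.invoke(null); // static method, no receiver needed
            }
        } catch (Throwable e) {
            // best-effort cleanup: never let a failure here break parsing
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        // no-op unless PDFBox is on the class path and the class was loaded
        invokeIfLoaded("org.apache.pdfbox.pdmodel.font.PDFont", "clearResources");
    }
}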

@@ -37,12 +37,12 @@ public class CrashProtectionHandler extends HandlerWrapper implements Handler, H
     }
     private void writeResponse(HttpServletRequest request, HttpServletResponse response, Exception exc) throws IOException {
         PrintWriter out = response.getWriter();
         out.println("Ops!");
         out.println();
         out.println("Message: " + exc.getMessage());
         exc.printStackTrace(out);
         response.setContentType("text/plain");
-        response.setStatus(500);
+        response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
     }
 }
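
Replacing the literal 500 with HttpServletResponse.SC_INTERNAL_SERVER_ERROR changes nothing at runtime; the Servlet API defines the constant as exactly 500. A one-line check, assuming the servlet-api jar on the class path:

import javax.servlet.http.HttpServletResponse;

// The named constant and the magic number are the same value; the change
// above only documents intent at the call site.
public class StatusCodeCheck {
    public static void main(String[] args) {
        System.out.println(HttpServletResponse.SC_INTERNAL_SERVER_ERROR == 500); // true
    }
}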

@@ -91,7 +91,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler
             HttpServletResponse response) throws IOException, ServletException {
         RequestHeader proxyHeaders = convertHeaderFromJetty(request);
-        final String httpVer = (String) request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER);
+        final String httpVer = request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER);
         setViaHeader (proxyHeaders, httpVer);
         proxyHeaders.remove(RequestHeader.KEEP_ALIVE);
         proxyHeaders.remove(RequestHeader.CONTENT_LENGTH);

@@ -27,7 +27,6 @@ package net.yacy.http;
 import java.io.IOException;
 import java.io.OutputStream;
-import javax.servlet.RequestDispatcher;
 import javax.servlet.ServletException;
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;

@@ -97,11 +97,6 @@ public class TemplateHandler extends AbstractHandler implements Handler
         htDocsPath = Switchboard.getSwitchboard().htDocsPath.getPath();
     }
-    @Override
-    protected void doStop() throws Exception {
-        super.doStop();
-    }
     /** Returns a path to the localized or default file according to the parameter localeSelection
      * @param path relative from htroot
      * @param localeSelection language of localized file; locale.language from switchboard is used if localeSelection.equals("") */

@@ -17,13 +17,13 @@ import java.net.SocketException;
  */
 public interface YaCyHttpServer {
-    abstract public void startupServer() throws Exception;
-    abstract public void stop() throws Exception;
-    abstract public void setMaxSessionCount(int cnt);
-    abstract public InetSocketAddress generateSocketAddress(String port) throws SocketException;
-    abstract public int getMaxSessionCount();
-    abstract public int getJobCount();
-    abstract public boolean withSSL();
-    abstract public void reconnect(int milsec);
-    abstract public String getVersion();
+    abstract void startupServer() throws Exception;
+    abstract void stop() throws Exception;
+    abstract void setMaxSessionCount(int cnt);
+    abstract InetSocketAddress generateSocketAddress(String port) throws SocketException;
+    abstract int getMaxSessionCount();
+    abstract int getJobCount();
+    abstract boolean withSSL();
+    abstract void reconnect(int milsec);
+    abstract String getVersion();
 }
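
This hunk is purely cosmetic: members of a Java interface are implicitly public and abstract, so the dropped modifiers were redundant. For illustration:

// Both declarations compile to exactly the same thing; interface methods are
// implicitly public and abstract, so the cleanup above only removes noise.
interface ModifierExample {
    public abstract void start(); // redundant modifiers
    void stop();                  // equivalent, preferred form
}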

@@ -25,7 +25,6 @@
 package net.yacy.peers;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -164,7 +163,7 @@ public class Transmission
         final ReferenceContainer<WordReference> c = (remaining >= container.size()) ? container : trimContainer(container, remaining);
         // iterate through the entries in the container and check if the reference is in the repository
         final List<byte[]> notFoundx = new ArrayList<byte[]>();
-        Collection<String> testids = new HashSet<String>();
+        Set<String> testids = new HashSet<String>();
         Iterator<WordReference> i = c.entries();
         while (i.hasNext()) {
             final WordReference e = i.next();

@@ -129,7 +129,7 @@ public class ResourceObserver
         if(MemoryControl.properState()) return Space.HIGH;
         // clear some caches - @all: are there more of these, we could clear here?
-        this.sb.index.clearCache();
+        this.sb.index.clearCaches();
         SearchEventCache.cleanupEvents(true);
         this.sb.trail.clear();
         Switchboard.urlBlacklist.clearblacklistCache();

@@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch
      * @param ids a collection of url hashes
      * @return a map from the hash id to: if it exists, the name of the database, otherwise null
      */
-    public Map<String, HarvestProcess> urlExists(final Collection<String> ids) {
+    public Map<String, HarvestProcess> urlExists(final Set<String> ids) {
         Set<String> e = this.index.exists(ids);
         Map<String, HarvestProcess> m = new HashMap<String, HarvestProcess>();
         for (String id: ids) {
@@ -2031,7 +2031,7 @@ public final class Switchboard extends serverSwitch
         // clear caches if necessary
         if ( !MemoryControl.request(128000000L, false) ) {
-            this.index.clearCache();
+            this.index.clearCaches();
             SearchEventCache.cleanupEvents(false);
             this.trail.clear();
             GuiHandler.clear();
@@ -2556,12 +2556,16 @@ public final class Switchboard extends serverSwitch
         ) {
             // get the hyperlinks
             final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
-            boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
-            if (loadImages) hl.putAll(Document.getImagelinks(documents));
+            for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
+                if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
+            }
             // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
             if (response.profile().directDocByURL()) {
-                if (!loadImages) hl.putAll(Document.getImagelinks(documents));
+                for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
+                    if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
+                }
                 hl.putAll(Document.getApplinks(documents));
                 hl.putAll(Document.getVideolinks(documents));
                 hl.putAll(Document.getAudiolinks(documents));
@@ -2905,7 +2909,7 @@ public final class Switchboard extends serverSwitch
         // stacking may fail because of double occurrences of that url. Therefore
         // we must wait here until the url has actually disappeared
         int t = 100;
-        Collection<String> ids = new ArrayList<String>(1); ids.add(ASCII.String(urlhash));
+        Set<String> ids = new HashSet<String>(1); ids.add(ASCII.String(urlhash));
         while (t-- > 0 && this.index.exists(ids).size() > 0) {
             try {Thread.sleep(100);} catch (final InterruptedException e) {}
             ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
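
The image-link hunk above replaces the global crawler.load.image switch with a per-link decision: image links whose target a text parser can handle (TextParser.supportsExtension returns null for "supported" and an error string otherwise) are crawled as documents, the rest are kept as pure NOLOAD links. A toy rendering of that split, with a made-up extension set standing in for YaCy's parser registry:

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Toy rendering of the per-link routing above. supportsExtension mirrors the
// YaCy convention of returning null when a parser exists and an error string
// when none does; the extension set is a stand-in, not YaCy's parser list.
public class ImageLinkRouting {

    private static final Set<String> PARSEABLE = new HashSet<String>(Arrays.asList("svg", "html"));

    static String supportsExtension(String url) {
        String ext = url.substring(url.lastIndexOf('.') + 1);
        return PARSEABLE.contains(ext) ? null : "no parser for ." + ext;
    }

    public static void main(String[] args) {
        Map<String, String> asDocuments = new HashMap<String, String>(); // crawled and parsed
        Map<String, String> asPureLinks = new HashMap<String, String>(); // NOLOAD queue
        for (String url : new String[] {"http://host/a.svg", "http://host/b.jpg"}) {
            if (supportsExtension(url) == null) asDocuments.put(url, "");
            else asPureLinks.put(url, "");
        }
        System.out.println("as documents: " + asDocuments.keySet());
        System.out.println("as pure links: " + asPureLinks.keySet());
    }
}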

@@ -323,7 +323,6 @@ public final class SwitchboardConstants
      * <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
      * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
      */
-    public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image";
     public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
     public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
     public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store

@@ -225,10 +225,10 @@ public final class Fulltext
         }
     }
-    public void clearCache() {
+    public void clearCaches() {
         if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
         if (this.statsDump != null) this.statsDump.clear();
-        this.solrInstances.clearCache();
+        this.solrInstances.clearCaches();
         this.statsDump = null;
     }
@@ -250,7 +250,7 @@ public final class Fulltext
             for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear();
         }
         this.commit(false);
-        this.solrInstances.clearCache();
+        this.solrInstances.clearCaches();
     }
 }
@@ -260,7 +260,7 @@ public final class Fulltext
         if (instance != null) {
             for (String name: instance.getCoreNames()) new RemoteSolrConnector(instance, name).clear();
         }
-        this.solrInstances.clearCache();
+        this.solrInstances.clearCaches();
     }
 }
@@ -400,7 +400,7 @@ public final class Fulltext
             throw new IOException(e.getMessage(), e);
         }
         this.statsDump = null;
-        if (MemoryControl.shortStatus()) clearCache();
+        if (MemoryControl.shortStatus()) clearCaches();
     }
     public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
@@ -412,7 +412,7 @@ public final class Fulltext
             throw new IOException(e.getMessage(), e);
         }
         this.statsDump = null;
-        if (MemoryControl.shortStatus()) clearCache();
+        if (MemoryControl.shortStatus()) clearCaches();
     }
     /**
@@ -432,7 +432,7 @@ public final class Fulltext
             throw new IOException(e.getMessage(), e);
         }
         this.statsDump = null;
-        if (MemoryControl.shortStatus()) clearCache();
+        if (MemoryControl.shortStatus()) clearCaches();
     }
     /**
@@ -617,10 +617,11 @@ public final class Fulltext
      * @param ids
      * @return a set of ids which exist in the database
      */
-    public Set<String> exists(Collection<String> ids) {
+    public Set<String> exists(Set<String> ids) {
         HashSet<String> e = new HashSet<String>();
         if (ids == null || ids.size() == 0) return e;
-        Collection<String> idsC = new HashSet<String>();
+        if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e;
+        Set<String> idsC = new HashSet<String>();
         idsC.addAll(ids);
         if (this.urlIndexFile != null) {
             Iterator<String> idsi = idsC.iterator();
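
Tightening the signature from Collection to Set and adding a single-element fast path lets the common one-id case skip the batch machinery entirely: for exactly one id, a cheap per-id exists() check decides between returning the caller's own set and an empty set. An isolated sketch, where existsOne stands in for Fulltext's per-id lookup against the url index and Solr:

import java.util.HashSet;
import java.util.Set;

// Isolated sketch of the single-element fast path added above; existsOne is
// a stand-in for Fulltext's per-id existence test.
public class ExistsFastPath {

    static boolean existsOne(String id) {
        return id.startsWith("known-"); // stand-in predicate
    }

    static Set<String> exists(Set<String> ids) {
        HashSet<String> e = new HashSet<String>();
        if (ids == null || ids.size() == 0) return e;
        // one id: answer from the cheap per-id check, no batch lookup
        if (ids.size() == 1) return existsOne(ids.iterator().next()) ? ids : e;
        for (String id : ids) if (existsOne(id)) e.add(id); // simplified batch path
        return e;
    }

    public static void main(String[] args) {
        Set<String> one = new HashSet<String>(); one.add("known-1");
        Set<String> other = new HashSet<String>(); other.add("missing-1");
        System.out.println(exists(one));   // [known-1]
        System.out.println(exists(other)); // []
    }
}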
@@ -751,12 +752,12 @@ public final class Fulltext
     }
     // export methods
-    public Export export(final File f, final String filter, final int format, final boolean dom) {
+    public Export export(final File f, final String filter, final String query, final int format, final boolean dom) {
         if ((this.exportthread != null) && (this.exportthread.isAlive())) {
             ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
             return this.exportthread;
         }
-        this.exportthread = new Export(f, filter, format, dom);
+        this.exportthread = new Export(f, filter, query, format, dom);
         this.exportthread.start();
         return this.exportthread;
     }
@@ -769,14 +770,15 @@ public final class Fulltext
         private final File f;
         private final Pattern pattern;
         private int count;
-        private String failure;
+        private String failure, query;
         private final int format;
         private final boolean dom;
-        private Export(final File f, final String filter, final int format, boolean dom) {
+        private Export(final File f, final String filter, final String query, final int format, boolean dom) {
             // format: 0=text, 1=html, 2=rss/xml
             this.f = f;
             this.pattern = filter == null ? null : Pattern.compile(filter);
+            this.query = query == null? "*:*" : query;
             this.count = 0;
             this.failure = null;
             this.format = format;
@@ -805,7 +807,7 @@ public final class Fulltext
                 if (this.dom) {
-                    Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
+                    Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
                     ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
                     for (final String host: stats) {
                         if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
@@ -814,21 +816,19 @@ public final class Fulltext
                         this.count++;
                     }
                 } else {
-                    BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
+                    BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
                             CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
                             CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
                     SolrDocument doc;
-                    ArrayList<?> title;
-                    String url, author, hash;
-                    String[] descriptions;
+                    String url, hash, title, author, description;
                     Integer size;
                     Date date;
                     while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
-                        hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
-                        url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
-                        title = (ArrayList<?>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
-                        author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName());
-                        descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName());
+                        hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
+                        url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                        title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
+                        author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
+                        description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
                         size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
                         date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
                         if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
@@ -836,16 +836,14 @@ public final class Fulltext
                             pw.println(url);
                         }
                         if (this.format == 1) {
-                            if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
+                            if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
                         }
                         if (this.format == 2) {
                             pw.println("<item>");
-                            if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
+                            if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
                             pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
                             if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
-                            if (descriptions != null && descriptions.length > 0) {
-                                for (String d: descriptions) pw.println("<description>" + CharacterCoding.unicode2xml(d, true) + "</description>");
-                            }
+                            if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
                             if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
                             if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
                             pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
@@ -883,6 +881,13 @@ public final class Fulltext
         public int count() {
             return this.count;
         }
+        @SuppressWarnings("unchecked")
+        private String getStringFrom(final Object o) {
+            if (o == null) return "";
+            if (o instanceof ArrayList) return ((ArrayList<String>) o).get(0);
+            return (String) o;
+        }
     }
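
The new getStringFrom helper exists because SolrJ returns multivalued fields (such as title and description_txt here) as an ArrayList even when only one value is stored, while single-valued fields arrive as plain Strings; the old per-field casts broke whenever a field's multiValued flag changed. A self-contained rendering of the same normalization:

import java.util.ArrayList;
import java.util.Arrays;

// Self-contained rendering of the getStringFrom normalization above: Solr
// multivalued fields come back as ArrayLists even with a single entry, while
// single-valued fields are plain Strings.
public class SolrFieldValueSketch {

    @SuppressWarnings("unchecked")
    static String getStringFrom(final Object o) {
        if (o == null) return "";
        if (o instanceof ArrayList) return ((ArrayList<String>) o).get(0);
        return (String) o;
    }

    public static void main(String[] args) {
        Object multiValued = new ArrayList<String>(Arrays.asList("first title", "second title"));
        Object singleValued = "http://example.org/";
        System.out.println(getStringFrom(multiValued));    // first title
        System.out.println(getStringFrom(singleValued));   // http://example.org/
        System.out.println(getStringFrom(null).isEmpty()); // true
    }
}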

@@ -29,7 +29,6 @@ package net.yacy.search.index;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
-import java.util.Collection;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.List;
@@ -443,7 +442,7 @@ public class Segment
      * @param ids
      * @return a set of ids which exist in the database
      */
-    public Set<String> exists(final Collection<String> ids) {
+    public Set<String> exists(final Set<String> ids) {
         return this.fulltext.exists(ids);
     }
@@ -504,10 +503,10 @@ public class Segment
         }
     }
-    public void clearCache() {
+    public void clearCaches() {
         if (this.urlCitationIndex != null) this.urlCitationIndex.clearCache();
         if (this.termIndex != null) this.termIndex.clearCache();
-        this.fulltext.clearCache();
+        this.fulltext.clearCaches();
     }
     public File getLocation() {

@@ -242,7 +242,8 @@ public class QueryGoal
         // add filter to prevent that results come from failed urls
         q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND (");
         q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *] OR ");
-        q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif))");
+        q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif) OR ");
+        q.append(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*))");
         // parse special requests
         if (isCatchall()) return q;
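
With the added clause (note the restored space after "OR ", without which the two appends would fuse into "ORcontent_type" and break the Solr syntax), the image filter now accepts a document if it embeds images, carries an image file extension, or declares an image MIME type. The assembled filter, with literal field names standing in for the CollectionSchema.*.getSolrFieldName() lookups:

// What the extended filter assembles, with literal field names standing in
// for the CollectionSchema lookups used in the real code.
public class ImageFilterSketch {
    public static void main(String[] args) {
        StringBuilder q = new StringBuilder();
        q.append("httpstatus_i").append(":200").append(" AND (");
        q.append("images_urlstub_sxt").append(":[* TO *] OR ");
        q.append("url_file_ext_s").append(":(jpg OR png OR gif) OR ");
        q.append("content_type").append(":(image/*))");
        System.out.println(q);
        // httpstatus_i:200 AND (images_urlstub_sxt:[* TO *] OR url_file_ext_s:(jpg OR png OR gif) OR content_type:(image/*))
    }
}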

@@ -898,17 +898,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
                     CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
             hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
-            if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
+            ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts");
+            int countcheck = 0;
             for (String host: hostscore.keyList(true)) {
                 // Patch the citation index for links with canonical tags.
                 // This shall fulfill the following requirement:
-                // If a document A links to B and B contains a 'canonical C', then the citation rank coputation shall consider that A links to C and B does not link to C.
+                // If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
                 // To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
                 String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]";
+                long patchquerycount = collectionConnector.getCountByQuery(patchquery);
                 BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 60000L, 50,
                         CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
                 SolrDocument doc_B;
+                int patchquerycountcheck = 0;
                 try {
                     while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                         // find all documents which link to the canonical doc
@@ -926,10 +928,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                             CitationReference doc_A_citation = doc_A_ids_iterator.next();
                             segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
                         }
+                        patchquerycountcheck++;
                     }
                 } catch (InterruptedException e) {
                 } catch (SpaceExceededException e) {
                 }
+                if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);
                 // do the citation rank computation
                 if (hostscore.get(host) <= 0) continue;
@@ -939,12 +943,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 while (convergence_attempts++ < 30) {
                     if (crh.convergenceStep()) break;
                 }
-                ConcurrentLog.info("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps");
+                ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
                 // we have now the cr for all documents of a specific host; we store them for later use
                 Map<byte[], CRV> crn = crh.normalize();
                 //crh.log(crn);
                 ranking.putAll(crn); // accumulate this here for usage in document update later
+                countcheck++;
             }
+            if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck);
         } catch (final IOException e2) {
             hostscore = new ClusteredScoreMap<String>();
         }
@@ -952,13 +958,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // process all documents at the webgraph for the outgoing links of this document
         SolrDocument doc;
         if (webgraphConnector != null) {
-            for (String host: hostscore.keyList(true)) {
-                if (hostscore.get(host) <= 0) continue;
-                // select all webgraph edges and modify their cr value
-                BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
-                        WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
-                        0, 10000000, 60000, 50);
-                try {
+            try {
+                for (String host: hostscore.keyList(true)) {
+                    if (hostscore.get(host) <= 0) continue;
+                    // select all webgraph edges and modify their cr value
+                    String query = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"";
+                    long count = webgraphConnector.getCountByQuery(query);
+                    ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph");
+                    BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 60000, 50);
+                    int countcheck = 0;
                     while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                         boolean changed = false;
                         SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
@@ -978,21 +986,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                             webgraphConnector.add(sid);
                         } catch (SolrException e) {
                         } catch (IOException e) {
                         }
+                        countcheck++;
                     }
-                } catch (final InterruptedException e) {}
+                    if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + countcheck);
+                }
+            } catch (final IOException e2) {
+                ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
+            } catch (final InterruptedException e3) {
+                ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
             }
         }
         // process all documents in collection
-        BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
-                (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
-                CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
-                0, 10000, 60000, 50);
+        String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+                CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
         int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
         Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
         Set<String> uniqueURLs = new HashSet<String>();
         try {
+            long count = collectionConnector.getCountByQuery(query);
+            ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
+            BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000, 60000, 50);
+            int countcheck = 0;
             while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                 // for each to-be-processed entry work on the process tag
                 Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
@@ -1031,8 +1047,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     if (!hostExtentCache.containsKey(hosthash)) {
                         StringBuilder q = new StringBuilder();
                         q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
-                        long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
-                        hostExtentCache.put(hosthash, count);
+                        long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
+                        hostExtentCache.put(hosthash, hostExtentCount);
                     }
                     if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;
@@ -1047,13 +1063,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     proccount++;
                 } catch (final Throwable e1) {
                 }
+                countcheck++;
             }
+            if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck);
             ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
                     proccount_clickdepthchange + " clickdepth changes, " +
                     proccount_referencechange + " reference-count changes, " +
                     proccount_uniquechange + " unique field changes, " +
                     proccount_citationchange + " citation ranking changes.");
-        } catch (final InterruptedException e) {
+        } catch (final InterruptedException e2) {
+            ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
+        } catch (IOException e3) {
+            ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
         }
         return proccount;
     }
@@ -1148,8 +1169,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             if (entry == null || entry.getValue() == null) continue;
             try {
                 String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName());
-                ConcurrentLog.info("CollectionConfiguration.CRHost", "CR for " + url);
-                ConcurrentLog.info("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString());
+                ConcurrentLog.info("CollectionConfiguration", "CR for " + url);
+                ConcurrentLog.info("CollectionConfiguration", ">> " + entry.getValue().toString());
             } catch (final IOException e) {
                 ConcurrentLog.logException(e);
             }
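
All of the countcheck additions in this file follow one pattern: fetch the expected hit count with getCountByQuery first, then count the documents actually drained from the concurrent queue before the poison element arrives, and log a warning on mismatch, since a silent shortfall would point at a queue timeout or a truncated query. Reduced to its core, with a String queue and sentinel standing in for the SolrDocument queue and AbstractSolrConnector.POISON_DOCUMENT:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

// The count-check pattern added throughout this hunk, reduced to its core.
// The String queue and POISON sentinel are stand-ins for the SolrDocument
// queue and AbstractSolrConnector.POISON_DOCUMENT used in the real code.
public class CountCheckSketch {

    static final String POISON = "POISON";

    public static void main(String[] args) throws InterruptedException {
        long count = 3; // in the real code: connector.getCountByQuery(query)
        BlockingQueue<String> docs = new ArrayBlockingQueue<String>(8);
        docs.put("doc-1"); docs.put("doc-2"); docs.put(POISON); // one document short
        int countcheck = 0;
        String doc;
        while ((doc = docs.take()) != POISON) { // reference comparison against the sentinel
            countcheck++; // process doc here
        }
        if (count != countcheck)
            System.out.println("ambiguous document count: expected=" + count + ", counted=" + countcheck);
    }
}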
