From 1437c45383fdf97c12875b93d7d2c62102398e5f Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 7 Nov 2013 21:30:17 +0100 Subject: [PATCH] merge rc1/master --- defaults/solr/solrconfig.xml | 39 +- defaults/yacy.init | 5 - htroot/ConfigHTCache_p.html | 2 +- htroot/ConfigHTCache_p.java | 4 +- htroot/ContentAnalysis_p.java | 2 +- htroot/HostBrowser.java | 5 +- htroot/IndexControlURLs_p.html | 3 + htroot/IndexControlURLs_p.java | 3 +- htroot/RankingSolr_p.java | 2 +- htroot/yacysearch.java | 2 +- .../cora/document/id/MultiProtocolURL.java | 4 +- .../solr/connector/AbstractSolrConnector.java | 3 +- .../solr/connector/CachedSolrConnector.java | 10 +- .../ConcurrentUpdateSolrConnector.java | 11 +- .../solr/connector/EmbeddedSolrConnector.java | 26 +- .../solr/connector/MirrorSolrConnector.java | 10 +- .../solr/connector/RemoteSolrConnector.java | 6 + .../solr/connector/SolrConnector.java | 9 +- .../solr/connector/SolrServerConnector.java | 2 +- .../solr/instance/InstanceMirror.java | 5 +- .../responsewriter/HTMLResponseWriter.java | 388 +++++++++--------- source/net/yacy/crawler/CrawlStacker.java | 10 +- source/net/yacy/crawler/data/Cache.java | 8 + source/net/yacy/data/BookmarksDB.java | 150 +++---- .../document/parser/html/CharacterCoding.java | 8 +- .../document/parser/html/ContentScraper.java | 8 +- .../net/yacy/document/parser/pdfParser.java | 73 ++-- .../net/yacy/http/CrashProtectionHandler.java | 14 +- source/net/yacy/http/ProxyHandler.java | 2 +- source/net/yacy/http/SSIHandler.java | 1 - source/net/yacy/http/TemplateHandler.java | 5 - source/net/yacy/http/YaCyHttpServer.java | 18 +- source/net/yacy/peers/Transmission.java | 3 +- source/net/yacy/search/ResourceObserver.java | 2 +- source/net/yacy/search/Switchboard.java | 16 +- .../net/yacy/search/SwitchboardConstants.java | 1 - source/net/yacy/search/index/Fulltext.java | 61 +-- source/net/yacy/search/index/Segment.java | 7 +- source/net/yacy/search/query/QueryGoal.java | 3 +- 
.../schema/CollectionConfiguration.java | 65 ++- 40 files changed, 524 insertions(+), 472 deletions(-) diff --git a/defaults/solr/solrconfig.xml b/defaults/solr/solrconfig.xml index 1234dd0d3..d8044f969 100644 --- a/defaults/solr/solrconfig.xml +++ b/defaults/solr/solrconfig.xml @@ -461,19 +461,21 @@ and old cache. --> + size="64" + initialSize="64" + autowarmCount="4" + cleanupThread="true"/> - + - + diff --git a/defaults/yacy.init b/defaults/yacy.init index d23d09bae..b910a354e 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -797,11 +797,6 @@ search.excludehosth= # the cases of nocache, iffresh and ifexist causes an index deletion search.verify.delete = true -# images may be treated either as documents that are shown in search results or as objects -# that are only visible in special search environments, like image search -search.excludeintext.image = true -crawler.load.image = true - # remote search details remotesearch.maxcount = 10 remotesearch.maxtime = 3000 diff --git a/htroot/ConfigHTCache_p.html b/htroot/ConfigHTCache_p.html index ae8c851ce..890ae9845 100644 --- a/htroot/ConfigHTCache_p.html +++ b/htroot/ConfigHTCache_p.html @@ -19,7 +19,7 @@
-
#[actualCacheSize]# MB
+
#[actualCacheSize]# MB for #[actualCacheDocCount]# files, #[docSizeAverage]# KB / file in average
MB
 
diff --git a/htroot/ConfigHTCache_p.java b/htroot/ConfigHTCache_p.java index 73141e65a..48d4df623 100644 --- a/htroot/ConfigHTCache_p.java +++ b/htroot/ConfigHTCache_p.java @@ -77,7 +77,9 @@ public class ConfigHTCache_p { } prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT)); - prop.put("actualCacheSize", (Cache.getActualCacheSize() / 1024 / 1024)); + prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024); + prop.put("actualCacheDocCount", Cache.getActualCacheDocCount()); + prop.put("docSizeAverage", Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024); prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64)); // return rewrite properties return prop; diff --git a/htroot/ContentAnalysis_p.java b/htroot/ContentAnalysis_p.java index 2ba573ab0..eed8455e7 100644 --- a/htroot/ContentAnalysis_p.java +++ b/htroot/ContentAnalysis_p.java @@ -34,7 +34,7 @@ public class ContentAnalysis_p { // clean up all search events SearchEventCache.cleanupEvents(true); - sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings + sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings if (post != null && post.containsKey("EnterDoublecheck")) { Ranking.setMinTokenLen(post.getInt("minTokenLen", 3)); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 833b77f00..8d53f191f 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -553,7 +553,6 @@ public class HostBrowser { } } catch (final IOException e) { } - } this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue(); this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 
0 : rc_exthosts.intValue(); @@ -562,7 +561,7 @@ public class HostBrowser { StringBuilder sbi = new StringBuilder(); int c = 0; for (String s: references_internal_urls) { - sbi.append("info"); + sbi.append("info"); c++; if (c % 80 == 0) sbi.append("
"); } @@ -570,7 +569,7 @@ public class HostBrowser { StringBuilder sbe = new StringBuilder(); c = 0; for (String s: references_external_urls) { - sbe.append("info"); + sbe.append("info"); c++; if (c % 80 == 0) sbe.append("
"); } diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index b83d63754..6b96b1b17 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -193,6 +193,9 @@ function updatepage(str) {
URL Filter
+
query
+
+
Export Format
Only Domain: Plain Text List (domains only)   diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 94d46ba0e..48da0982c 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -261,7 +261,8 @@ public class IndexControlURLs_p { final File f = new File(s); f.getParentFile().mkdirs(); final String filter = post.get("exportfilter", ".*"); - final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom); + final String query = post.get("exportquery", "*:*"); + final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom); prop.put("lurlexport_exportfile", s); prop.put("lurlexport_urlcount", running.count()); diff --git a/htroot/RankingSolr_p.java b/htroot/RankingSolr_p.java index 04784f938..91e543a11 100644 --- a/htroot/RankingSolr_p.java +++ b/htroot/RankingSolr_p.java @@ -38,7 +38,7 @@ public class RankingSolr_p { // clean up all search events SearchEventCache.cleanupEvents(true); - sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings + sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings int profileNr = 0; if (post != null) profileNr = post.getInt("profileNr", profileNr); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 284a0b15e..b79c8061b 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -360,7 +360,7 @@ public class yacysearch { // check available memory and clean up if necessary if ( !MemoryControl.request(8000000L, false) ) { - indexSegment.clearCache(); + indexSegment.clearCaches(); SearchEventCache.cleanupEvents(false); } diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index de91810f7..c1b2000bf 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -57,6 +57,7 @@ import 
net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.CommonPattern; +import net.yacy.document.parser.html.CharacterCoding; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file @@ -66,7 +67,6 @@ public class MultiProtocolURL implements Serializable, Comparable existsByIds(Collection ids) throws IOException { + public Set existsByIds(Set ids) throws IOException { if (ids == null || ids.size() == 0) return new HashSet(); // construct raw query final SolrQuery params = new SolrQuery(); diff --git a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java index c96fe2d33..eaf93603c 100644 --- a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java @@ -61,7 +61,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo this.missCache = new ConcurrentARC(missCacheMax, partitions); } - public void clearCache() { + public void clearCaches() { this.hitCache.clear(); this.missCache.clear(); this.documentCache.clear(); @@ -70,9 +70,9 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo @Override public synchronized void close() { + this.clearCaches(); if (this.solr != null) this.solr.close(); this.solr = null; - this.clearCache(); } /** @@ -81,7 +81,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo */ @Override public void clear() throws IOException { - this.clearCache(); + this.clearCaches(); if (this.solr != null) this.solr.clear(); } @@ -119,7 +119,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo @Override public void deleteByQuery(final String querystring) throws IOException { - this.clearCache(); + this.clearCaches(); 
this.solr.deleteByQuery(querystring); } @@ -261,7 +261,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo } private void addToCache(SolrDocumentList list, boolean doccache) { - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); for (final SolrDocument solrdoc: list) { addToCache(solrdoc, doccache); } diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index 8eff5f315..ddbf550ec 100644 --- a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -118,6 +118,12 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { ensureAliveUpdateHandler(); } + @Override + public void clearCaches() { + this.connector.clearCaches(); + this.idCache.clear(); + } + /** * used for debugging */ @@ -326,10 +332,11 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } @Override - public Set existsByIds(Collection ids) throws IOException { + public Set existsByIds(Set ids) throws IOException { HashSet e = new HashSet(); if (ids == null || ids.size() == 0) return e; - Collection idsC = new HashSet(); + if (ids.size() == 1) return existsById(ids.iterator().next()) ? 
ids : e; + Set idsC = new HashSet(); for (String id: ids) { if (this.idCache.has(ASCII.getBytes(id))) {cacheSuccessSign(); e.add(id); continue;} if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); continue;} diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 6348c79a3..10d36a9c9 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -22,7 +22,6 @@ package net.yacy.cora.federate.solr.connector; import java.io.IOException; -import java.util.Collection; import java.util.HashSet; import java.util.Set; import java.util.concurrent.BlockingQueue; @@ -35,6 +34,7 @@ import net.yacy.search.schema.CollectionSchema; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.search.Query; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.response.QueryResponse; @@ -48,10 +48,14 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.handler.component.SearchHandler; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequestBase; +import org.apache.solr.request.UnInvertedField; import org.apache.solr.response.ResultContext; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocList; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.QueryResultKey; +import org.apache.solr.search.SolrCache; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.util.RefCounted; @@ -89,6 +93,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo super.init(this.instance.getServer(coreName)); } + public void clearCaches() { + SolrConfig solrConfig 
= this.core.getSolrConfig(); + @SuppressWarnings("unchecked") + SolrCache fieldValueCache = solrConfig.fieldValueCacheConfig == null ? null : solrConfig.fieldValueCacheConfig.newInstance(); + if (fieldValueCache != null) fieldValueCache.clear(); + @SuppressWarnings("unchecked") + SolrCache filterCache= solrConfig.filterCacheConfig == null ? null : solrConfig.filterCacheConfig.newInstance(); + if (filterCache != null) filterCache.clear(); + @SuppressWarnings("unchecked") + SolrCache queryResultCache = solrConfig.queryResultCacheConfig == null ? null : solrConfig.queryResultCacheConfig.newInstance(); + if (queryResultCache != null) queryResultCache.clear(); + @SuppressWarnings("unchecked") + SolrCache documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance(); + if (documentCache != null) documentCache.clear(); + } + public SolrInstance getInstance() { return this.instance; } @@ -224,9 +244,9 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo } @Override - public Set existsByIds(Collection ids) { + public Set existsByIds(Set ids) { if (ids == null || ids.size() == 0) return new HashSet(); - if (ids.size() == 1 && ids instanceof Set) return existsById(ids.iterator().next()) ? (Set) ids : new HashSet(); + if (ids.size() == 1) return existsById(ids.iterator().next()) ? 
ids : new HashSet(); StringBuilder sb = new StringBuilder(); // construct something like "({!raw f=id}Ij7B63g-gSHA) OR ({!raw f=id}PBcGI3g-gSHA)" for (String id: ids) { sb.append("({!raw f=").append(CollectionSchema.id.getSolrFieldName()).append('}').append(id).append(") OR "); diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index 3f7a1453c..19fa604c5 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -53,6 +53,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo this.solr0 = solr0; this.solr1 = solr1; } + + @Override + public void clearCaches() { + if (this.solr0 != null) this.solr0.clearCaches(); + if (this.solr1 != null) this.solr1.clearCaches(); + } public boolean isConnected0() { return this.solr0 != null; @@ -347,7 +353,9 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public Set existsByIds(Collection ids) throws IOException { + public Set existsByIds(Set ids) throws IOException { + if (ids == null || ids.size() == 0) return new HashSet(); + if (ids.size() == 1) return existsById(ids.iterator().next()) ? 
ids : new HashSet(); if (this.solr0 != null && this.solr1 == null) return this.solr0.existsByIds(ids); if (this.solr0 == null && this.solr1 != null) return this.solr1.existsByIds(ids); Set s = new HashSet(); diff --git a/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java index 4e2a9369f..0ab5f8b31 100644 --- a/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java @@ -71,6 +71,11 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn super.close(); } + @Override + public void clearCaches() { + // we do not have a direct access to the caches here, thus we simply do nothing. + } + @Override public QueryResponse getResponseByParams(ModifiableSolrParams params) throws IOException { // during the solr query we set the thread name to the query string to get more debugging info in thread dumps @@ -134,4 +139,5 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn } System.exit(0); } + } diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index e7a3dd957..f28d26f09 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -36,7 +36,12 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.ModifiableSolrParams; public interface SolrConnector extends Iterable /* Iterable of document IDs */ { - + + /** + * clear all caches: inside solr and ouside solr within the implementations of this interface + */ + public void clearCaches(); + /** * get the size of the index * @return number of results if solr is queries with a catch-all pattern @@ -106,7 +111,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * 
@return a collection of a subset of the ids which exist in the index * @throws IOException */ - public Set existsByIds(Collection ids) throws IOException; + public Set existsByIds(Set ids) throws IOException; /** * check if a given document exists in solr diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index f12d43950..aec6352f0 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -64,7 +64,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen public SolrServer getServer() { return this.server; } - + @Override public void commit(final boolean softCommit) { synchronized (this.server) { diff --git a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java index 6b9b7a939..1d49fd537 100644 --- a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java +++ b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java @@ -24,7 +24,6 @@ import java.util.Collection; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.federate.solr.connector.CachedSolrConnector; import net.yacy.cora.federate.solr.connector.ConcurrentUpdateSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.connector.MirrorSolrConnector; @@ -161,9 +160,9 @@ public class InstanceMirror { return msc; } - public void clearCache() { + public void clearCaches() { for (SolrConnector csc: this.connectorCache.values()) { - if (csc instanceof CachedSolrConnector) ((CachedSolrConnector) csc).clearCache(); + csc.clearCaches(); } for (EmbeddedSolrConnector ssc: this.embeddedCache.values()) ssc.commit(true); } diff --git 
a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java index 19125afda..a4d3c38be 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java @@ -1,195 +1,193 @@ -/** - * HTMLResponseWriter - * Copyright 2013 by Michael Peter Christen - * First released 09.06.2013 at http://yacy.net - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . 
- */ - -package net.yacy.cora.federate.solr.responsewriter; - -import java.io.IOException; -import java.io.Writer; -import java.util.Date; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -import net.yacy.cora.federate.solr.SolrType; -import net.yacy.search.schema.CollectionSchema; - -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexableField; -import org.apache.solr.common.params.SolrParams; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.common.util.XML; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.response.QueryResponseWriter; -import org.apache.solr.response.ResultContext; -import org.apache.solr.response.SolrQueryResponse; -import org.apache.solr.schema.FieldType; -import org.apache.solr.schema.IndexSchema; -import org.apache.solr.schema.SchemaField; -import org.apache.solr.schema.TextField; -import org.apache.solr.search.DocIterator; -import org.apache.solr.search.DocList; -import org.apache.solr.search.SolrIndexSearcher; - -public class HTMLResponseWriter implements QueryResponseWriter { - - private static final Set DEFAULT_FIELD_LIST = null; - private static final Pattern dqp = Pattern.compile("\""); - - public HTMLResponseWriter() { - super(); - } - - @Override - public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) { - return "text/html"; - } - - @Override - public void init(@SuppressWarnings("rawtypes") NamedList n) { - } - - @Override - public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException { - NamedList values = rsp.getValues(); - assert values.get("responseHeader") != null; - assert values.get("response") != null; - - writer.write("\n"); - //writer.write("\n"); - writer.write("\n"); - writer.write("\n"); - //writer.write("\n"); - writer.write("\n"); - writer.write("\n"); - 
NamedList paramsList = request.getOriginalParams().toNamedList(); - paramsList.remove("wt"); - String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); - writer.write("
\"API\"\n"); - writer.write("This search result can also be retrieved as XML. Click the API icon to see this page as XML.
\n"); - - DocList response = ((ResultContext) values.get("response")).docs; - final int sz = response.size(); - if (sz > 0) { - SolrIndexSearcher searcher = request.getSearcher(); - DocIterator iterator = response.iterator(); - IndexSchema schema = request.getSchema(); - - int id = iterator.nextDoc(); - Document doc = searcher.doc(id, DEFAULT_FIELD_LIST); - LinkedHashMap tdoc = translateDoc(schema, doc); - - String title = tdoc.get(CollectionSchema.title.getSolrFieldName()); - if (sz == 1) { - writer.write("" + title + "\n\n"); - } else { - writer.write("Document List\n\n"); - } - writer.write("
\"API\"\n"); - writer.write("This search result can also be retrieved as XML. Click the API icon to see this page as XML.
\n"); - - writeDoc(writer, tdoc, title); - - while (iterator.hasNext()) { - id = iterator.nextDoc(); - doc = searcher.doc(id, DEFAULT_FIELD_LIST); - tdoc = translateDoc(schema, doc); - title = tdoc.get(CollectionSchema.title.getSolrFieldName()); - writeDoc(writer, tdoc, title); - } - } else { - writer.write("No Document Found\n\n"); - } - - writer.write("\n"); - } - - private static final void writeDoc(Writer writer, LinkedHashMap tdoc, String title) throws IOException { - writer.write("
\n"); - writer.write("
\n"); - writer.write("

" + title + "

\n"); - writer.write("
\n"); - for (Map.Entry entry: tdoc.entrySet()) { - writer.write("
"); - writer.write(entry.getKey()); - writer.write("
"); - XML.escapeAttributeValue(entry.getValue(), writer); - writer.write("
\n"); - } - writer.write("
\n"); - writer.write("
\n"); - writer.write("
\n"); - } - - static final LinkedHashMap translateDoc(final IndexSchema schema, final Document doc) { - List fields = doc.getFields(); - int sz = fields.size(); - int fidx1 = 0, fidx2 = 0; - LinkedHashMap kv = new LinkedHashMap(); - while (fidx1 < sz) { - IndexableField value = fields.get(fidx1); - String fieldName = value.name(); - fidx2 = fidx1 + 1; - while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) { - fidx2++; - } - SchemaField sf = schema.getFieldOrNull(fieldName); - if (sf == null) sf = new SchemaField(fieldName, new TextField()); - FieldType type = sf.getType(); - - if (fidx1 + 1 == fidx2) { - if (sf.multiValued()) { - String sv = value.stringValue(); - kv.put(fieldName, field2string(type, sv)); - } else { - kv.put(fieldName, field2string(type, value.stringValue())); - } - } else { - for (int i = fidx1; i < fidx2; i++) { - String sv = fields.get(i).stringValue(); - kv.put(fieldName + "_" + i, field2string(type, sv)); - } - } - - fidx1 = fidx2; - } - return kv; - } - - @SuppressWarnings("deprecation") - private static String field2string(final FieldType type, final String value) { - String typeName = type.getTypeName(); - if (typeName.equals(SolrType.bool.printName())) { - return "F".equals(value) ? "false" : "true"; - } else if (typeName.equals(SolrType.date.printName())) { - return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here - } - return value; - } - - // XML.escapeCharData(val, writer); -} +/** + * HTMLResponseWriter + * Copyright 2013 by Michael Peter Christen + * First released 09.06.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.federate.solr.responsewriter; + +import java.io.IOException; +import java.io.Writer; +import java.util.Date; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import net.yacy.cora.federate.solr.SolrType; +import net.yacy.search.schema.CollectionSchema; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexableField; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.XML; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.QueryResponseWriter; +import org.apache.solr.response.ResultContext; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.TextField; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.SolrIndexSearcher; + +public class HTMLResponseWriter implements QueryResponseWriter { + + private static final Set DEFAULT_FIELD_LIST = null; + private static final Pattern dqp = Pattern.compile("\""); + + public HTMLResponseWriter() { + super(); + } + + @Override + public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) { + return "text/html"; + } + + @Override + public void init(@SuppressWarnings("rawtypes") NamedList n) { + } + + @Override + 
public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException { + NamedList values = rsp.getValues(); + assert values.get("responseHeader") != null; + assert values.get("response") != null; + + writer.write("\n"); + //writer.write("\n"); + writer.write("\n"); + writer.write("\n"); + //writer.write("\n"); + writer.write("\n"); + writer.write("\n"); + NamedList paramsList = request.getOriginalParams().toNamedList(); + paramsList.remove("wt"); + String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); + + DocList response = ((ResultContext) values.get("response")).docs; + final int sz = response.size(); + if (sz > 0) { + SolrIndexSearcher searcher = request.getSearcher(); + DocIterator iterator = response.iterator(); + IndexSchema schema = request.getSchema(); + + int id = iterator.nextDoc(); + Document doc = searcher.doc(id, DEFAULT_FIELD_LIST); + LinkedHashMap tdoc = translateDoc(schema, doc); + + String title = tdoc.get(CollectionSchema.title.getSolrFieldName()); + if (sz == 1) { + writer.write("" + title + "\n\n"); + } else { + writer.write("Document List\n\n"); + } + writer.write("
\"API\"\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see this page as XML.
\n"); + + writeDoc(writer, tdoc, title); + + while (iterator.hasNext()) { + id = iterator.nextDoc(); + doc = searcher.doc(id, DEFAULT_FIELD_LIST); + tdoc = translateDoc(schema, doc); + title = tdoc.get(CollectionSchema.title.getSolrFieldName()); + writeDoc(writer, tdoc, title); + } + } else { + writer.write("No Document Found\n\n"); + } + + writer.write("\n"); + } + + private static final void writeDoc(Writer writer, LinkedHashMap tdoc, String title) throws IOException { + writer.write("
\n"); + writer.write("
\n"); + writer.write("

" + title + "

\n"); + writer.write("
\n"); + for (Map.Entry entry: tdoc.entrySet()) { + writer.write("
"); + writer.write(entry.getKey()); + writer.write("
"); + XML.escapeAttributeValue(entry.getValue(), writer); + writer.write("
\n"); + } + writer.write("
\n"); + writer.write("
\n"); + writer.write("
\n"); + } + + static final LinkedHashMap translateDoc(final IndexSchema schema, final Document doc) { + List fields = doc.getFields(); + int sz = fields.size(); + int fidx1 = 0, fidx2 = 0; + LinkedHashMap kv = new LinkedHashMap(); + while (fidx1 < sz) { + IndexableField value = fields.get(fidx1); + String fieldName = value.name(); + fidx2 = fidx1 + 1; + while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) { + fidx2++; + } + SchemaField sf = schema.getFieldOrNull(fieldName); + if (sf == null) sf = new SchemaField(fieldName, new TextField()); + FieldType type = sf.getType(); + + if (fidx1 + 1 == fidx2) { + if (sf.multiValued()) { + String sv = value.stringValue(); + kv.put(fieldName, field2string(type, sv)); + } else { + kv.put(fieldName, field2string(type, value.stringValue())); + } + } else { + for (int i = fidx1; i < fidx2; i++) { + String sv = fields.get(i).stringValue(); + kv.put(fieldName + "_" + i, field2string(type, sv)); + } + } + + fidx1 = fidx2; + } + return kv; + } + + @SuppressWarnings("deprecation") + private static String field2string(final FieldType type, final String value) { + String typeName = type.getTypeName(); + if (typeName.equals(SolrType.bool.printName())) { + return "F".equals(value) ? 
"false" : "true"; + } else if (typeName.equals(SolrType.date.printName())) { + return org.apache.solr.schema.DateField.formatExternal(new Date(Long.parseLong(value))); // this is declared deprecated in solr 4.2.1 but is still used as done here + } + return value; + } + + // XML.escapeCharData(val, writer); +} diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index a3b5da25f..211fa2e50 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -55,6 +55,7 @@ import net.yacy.crawler.retrieval.HTTPLoader; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.crawler.robots.RobotsTxt; +import net.yacy.document.TextParser; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.workflow.WorkflowProcessor; @@ -347,17 +348,10 @@ public final class CrawlStacker { // check availability of parser and maxfilesize String warning = null; - boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true); - if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) { - // dammit semicolon - // TODO: remove this shit later - Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true); - loadImages = true; - } ContentDomain contentDomain = entry.url().getContentDomainFromExt(); if ((maxFileSize >= 0 && entry.size() > maxFileSize) || contentDomain == ContentDomain.APP || - (!loadImages && contentDomain == ContentDomain.IMAGE) || + (contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) || contentDomain == ContentDomain.AUDIO || contentDomain == ContentDomain.VIDEO || contentDomain == ContentDomain.CTRL) { diff --git a/source/net/yacy/crawler/data/Cache.java b/source/net/yacy/crawler/data/Cache.java index 
f1d72354f..9973f08a0 100644 --- a/source/net/yacy/crawler/data/Cache.java +++ b/source/net/yacy/crawler/data/Cache.java @@ -182,6 +182,14 @@ public final class Cache { public static long getActualCacheSize() { return fileDBunbuffered.length(); } + + /** + * get the current actual cache size + * @return + */ + public static long getActualCacheDocCount() { + return fileDBunbuffered.size(); + } /** * close the databases diff --git a/source/net/yacy/data/BookmarksDB.java b/source/net/yacy/data/BookmarksDB.java index 1c11b4b15..d9c0140a6 100644 --- a/source/net/yacy/data/BookmarksDB.java +++ b/source/net/yacy/data/BookmarksDB.java @@ -41,7 +41,10 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.blob.MapHeap; +import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.index.RowHandleSet; public class BookmarksDB { @@ -147,11 +150,6 @@ public class BookmarksDB { ConcurrentLog.logException(e); } } - public String addBookmark(final Bookmark bookmark){ - saveBookmark(bookmark); - return bookmark.getUrlHash(); - - } public Bookmark getBookmark(final String urlHash) throws IOException { try { @@ -214,18 +212,13 @@ public class BookmarksDB { final TreeSet set=new TreeSet(new bookmarkComparator(true)); final String tagHash=BookmarkHelper.tagHash(tagName); final Tag tag=getTag(tagHash); - Set hashes=new HashSet(); - if (tag != null) { - hashes=getTag(tagHash).getUrlHashes(); - } + RowHandleSet hashes = tag == null ? 
new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes(); if (priv) { - set.addAll(hashes); + for (byte[] hash: hashes) set.add(ASCII.String(hash)); } else { - final Iterator it=hashes.iterator(); - Bookmark bm; - while(it.hasNext()){ + for (byte[] hash: hashes) { try { - bm = getBookmark(it.next()); + Bookmark bm = getBookmark(ASCII.String(hash)); if (bm != null && bm.getPublic()) { set.add(bm.getUrlHash()); } @@ -249,7 +242,7 @@ public class BookmarksDB { * retrieve an object of type Tag from the the tagCache, if object is not cached return loadTag(hash) * @param hash an object of type String, containing a tagHash */ - public Tag getTag(final String hash){ + private Tag getTag(final String hash){ return this.tags.get(hash); //null if it does not exists } @@ -257,7 +250,7 @@ public class BookmarksDB { * store a Tag in tagsTable or remove an empty tag * @param tag an object of type Tag to be stored/removed */ - public void putTag(final Tag tag){ + private void putTag(final Tag tag){ if (tag == null) return; if (tag.isEmpty()) { this.tags.remove(tag.getTagHash()); @@ -266,7 +259,7 @@ public class BookmarksDB { } } - public void removeTag(final String hash) { + private void removeTag(final String hash) { this.tags.remove(hash); } @@ -301,7 +294,7 @@ public class BookmarksDB { return set.iterator(); } - public Iterator getTagIterator(final String tagName, final boolean priv, final int comp) { + private Iterator getTagIterator(final String tagName, final boolean priv, final int comp) { final TreeSet set=new TreeSet((comp == SORT_SIZE) ? 
tagSizeComparator : tagComparator); Iterator it=null; final Iterator bit=getBookmarksIterator(tagName, priv); @@ -347,14 +340,14 @@ public class BookmarksDB { final Tag oldTag=getTag(BookmarkHelper.tagHash(oldName)); if (oldTag != null) { - final Set urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag + final RowHandleSet urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag removeTag(BookmarkHelper.tagHash(oldName)); // remove oldHash from TagsDB Bookmark bookmark; Set tagSet = new TreeSet(String.CASE_INSENSITIVE_ORDER); - for (final String urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName + for (final byte[] urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName try { - bookmark = getBookmark(urlHash); + bookmark = getBookmark(ASCII.String(urlHash)); tagSet = bookmark.getTags(); tagSet.remove(oldName); bookmark.setTags(tagSet, true); // might not be needed, but doesn't hurt @@ -371,9 +364,9 @@ public class BookmarksDB { public void addTag(final String selectTag, final String newTag) { Bookmark bookmark; - for (final String urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag + for (final byte[] urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag try { - bookmark = getBookmark(urlHash); + bookmark = getBookmark(ASCII.String(urlHash)); bookmark.addTag(newTag); saveBookmark(bookmark); } catch (final IOException e) { @@ -389,51 +382,24 @@ public class BookmarksDB { * Subclass of bookmarksDB, which provides the Tag object-type */ public class Tag { - public static final String URL_HASHES = "urlHashes"; - public static final String TAG_NAME = "tagName"; private final String tagHash; - private final Map mem; - private Set urlHashes; - - public Tag(final String hash, final Map map){ - this.tagHash = hash; - 
this.mem = map; - if (this.mem.containsKey(URL_HASHES)) { - this.urlHashes = ListManager.string2set(this.mem.get(URL_HASHES)); - } else { - this.urlHashes = new HashSet(); - } - } + private final String tagName; + private RowHandleSet urlHashes; - public Tag(final String name, final HashSet entries){ + private Tag(final String name) { this.tagHash = BookmarkHelper.tagHash(name); - this.mem = new HashMap(); - //mem.put(URL_HASHES, listManager.arraylist2string(entries)); - this.urlHashes = entries; - this.mem.put(TAG_NAME, name); - } - - public Tag(final String name){ - this(name, new HashSet()); - } - - public Map getMap(){ - this.mem.put(URL_HASHES, ListManager.collection2string(this.urlHashes)); - return this.mem; + this.tagName = name; + this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); } /** * get the lowercase Tagname */ public String getTagName(){ - /*if(this.mem.containsKey(TAG_NAME)){ - return (String) this.mem.get(TAG_NAME); - } - return "";*/ return getFriendlyName().toLowerCase(); } - public String getTagHash(){ + private String getTagHash(){ return this.tagHash; } @@ -441,37 +407,33 @@ public class BookmarksDB { * @return the tag name, with all uppercase chars */ public String getFriendlyName(){ - /*if(this.mem.containsKey(TAG_FRIENDLY_NAME)){ - return (String) this.mem.get(TAG_FRIENDLY_NAME); - } - return getTagName();*/ - if(this.mem.containsKey(TAG_NAME)){ - return this.mem.get(TAG_NAME); - } - return "notagname"; + return this.tagName; } - public Set getUrlHashes(){ + private RowHandleSet getUrlHashes(){ return this.urlHashes; } - public boolean hasPublicItems(){ + private boolean hasPublicItems(){ return getBookmarksIterator(getTagName(), false).hasNext(); } - public void addUrl(final String urlHash){ - this.urlHashes.add(urlHash); + private void addUrl(final String urlHash){ + try { + this.urlHashes.put(ASCII.getBytes(urlHash)); + } catch (SpaceExceededException e) { + } } - public 
void delete(final String urlHash){ - this.urlHashes.remove(urlHash); + private void delete(final String urlHash){ + this.urlHashes.remove(ASCII.getBytes(urlHash)); } public int size(){ return this.urlHashes.size(); } - public boolean isEmpty() { + private boolean isEmpty() { return this.urlHashes.isEmpty(); } } @@ -481,27 +443,19 @@ public class BookmarksDB { */ public class Bookmark { - public static final String BOOKMARK_URL = "bookmarkUrl"; + private static final String BOOKMARK_URL = "bookmarkUrl"; public static final String BOOKMARK_TITLE = "bookmarkTitle"; public static final String BOOKMARK_DESCRIPTION = "bookmarkDesc"; - public static final String BOOKMARK_TAGS = "bookmarkTags"; - public static final String BOOKMARK_PUBLIC = "bookmarkPublic"; - public static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp"; - public static final String BOOKMARK_OWNER = "bookmarkOwner"; - public static final String BOOKMARK_IS_FEED = "bookmarkIsFeed"; + private static final String BOOKMARK_TAGS = "bookmarkTags"; + private static final String BOOKMARK_PUBLIC = "bookmarkPublic"; + private static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp"; + private static final String BOOKMARK_OWNER = "bookmarkOwner"; + private static final String BOOKMARK_IS_FEED = "bookmarkIsFeed"; private final String urlHash; private Set tagNames; private long timestamp; private final Map entry; - public Bookmark(final String urlHash, final Map map) { - this.entry = map; - this.urlHash = urlHash; - this.tagNames = new TreeSet(String.CASE_INSENSITIVE_ORDER); - if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS))); - loadTimestamp(); - } - public Bookmark(final DigestURL url) { this.entry = new HashMap(); this.urlHash = ASCII.String(url.hash()); @@ -529,11 +483,15 @@ public class BookmarksDB { this(new DigestURL((url.indexOf("://") < 0) ? 
"http://" + url : url)); } - public Bookmark(final Map map) throws MalformedURLException { - this(ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()), map); + private Bookmark(final Map map) throws MalformedURLException { + this.entry = map; + this.urlHash = ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()); + this.tagNames = new TreeSet(String.CASE_INSENSITIVE_ORDER); + if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS))); + loadTimestamp(); } - Map toMap() { + private Map toMap() { this.entry.put(BOOKMARK_TAGS, ListManager.collection2string(this.tagNames)); this.entry.put(BOOKMARK_TIMESTAMP, String.valueOf(this.timestamp)); return this.entry; @@ -688,11 +646,11 @@ public class BookmarksDB { /** * Subclass of bookmarksDB, which provides the bookmarkIterator object-type */ - public class bookmarkIterator implements Iterator { + private class bookmarkIterator implements Iterator { Iterator bookmarkIter; - public bookmarkIterator(final boolean up) throws IOException { + private bookmarkIterator(final boolean up) throws IOException { //flushBookmarkCache(); //XXX: this will cost performance this.bookmarkIter = BookmarksDB.this.bookmarks.keys(up, false); //this.nextEntry = null; @@ -722,14 +680,14 @@ public class BookmarksDB { /** * Comparator to sort objects of type Bookmark according to their timestamps */ - public class bookmarkComparator implements Comparator { + private class bookmarkComparator implements Comparator { private final boolean newestFirst; /** * @param newestFirst newest first, or oldest first? 
*/ - public bookmarkComparator(final boolean newestFirst){ + private bookmarkComparator(final boolean newestFirst){ this.newestFirst = newestFirst; } @@ -752,13 +710,13 @@ public class BookmarksDB { } } - public static final TagComparator tagComparator = new TagComparator(); - public static final TagSizeComparator tagSizeComparator = new TagSizeComparator(); + private static final TagComparator tagComparator = new TagComparator(); + private static final TagSizeComparator tagSizeComparator = new TagSizeComparator(); /** * Comparator to sort objects of type Tag according to their names */ - public static class TagComparator implements Comparator, Serializable { + private static class TagComparator implements Comparator, Serializable { /** * generated serial @@ -772,7 +730,7 @@ public class BookmarksDB { } - public static class TagSizeComparator implements Comparator, Serializable { + private static class TagSizeComparator implements Comparator, Serializable { /** * generated serial diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java index 213c975b0..f93300cbd 100644 --- a/source/net/yacy/document/parser/html/CharacterCoding.java +++ b/source/net/yacy/document/parser/html/CharacterCoding.java @@ -26,12 +26,15 @@ package net.yacy.document.parser.html; import java.util.HashMap; import java.util.Map; +import java.util.regex.Pattern; /** * Contains methods to convert between Unicode and XML/HTML encoding. */ public final class CharacterCoding { + /** Ampersand pattern */ + public final static Pattern ampPattern = Pattern.compile(Pattern.quote("&")); /** Ampersand character in unicode encoding. */ private static final char AMP_UNICODE = "\u0026".charAt(0); /** Ampersand character in HTML encoding. */ @@ -276,14 +279,15 @@ public final class CharacterCoding { } return sb.toString(); } - + /** * Replaces HTML-encoded characters with unicode representation. 
* @param text text with character to replace * @return text with replaced characters */ - public static String html2unicode(final String text) { + public static String html2unicode(String text) { if (text == null) return null; + text = ampPattern.matcher(text).replaceAll("&"); // sometimes a double-replacement is necessary. int p = 0, p1, q; final StringBuilder sb = new StringBuilder(text.length()); String s; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 66931a720..285cf26a1 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -204,11 +204,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } @Override - public void scrapeText(final char[] newtext, final String insideTag) { + public void scrapeText(final char[] newtext0, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return; int p, pl, q, s = 0; - + char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); + // match evaluation pattern this.evaluationScores.match(Element.text, newtext); @@ -466,7 +467,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { - final String href = tagopts.getProperty("href", EMPTY_STRING); + String href = tagopts.getProperty("href", EMPTY_STRING); + href = CharacterCoding.html2unicode(href); AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); diff --git 
a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 72181ca7a..d74114180 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -32,27 +32,15 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.Method; import java.util.Date; -import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; -import org.apache.pdfbox.pdmodel.font.PDCIDFont; -import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font; -import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDMMType1Font; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; -import org.apache.pdfbox.pdmodel.font.PDType0Font; -import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont; -import org.apache.pdfbox.pdmodel.font.PDType1CFont; -import org.apache.pdfbox.pdmodel.font.PDType1Font; -import org.apache.pdfbox.pdmodel.font.PDType3Font; import org.apache.pdfbox.util.PDFTextStripper; import net.yacy.cora.document.id.AnchorURL; @@ -222,25 +210,54 @@ public class pdfParser extends AbstractParser implements Parser { false, docDate)}; } - - @SuppressWarnings("static-access") + public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() { // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE 
TEXT! // Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain. - PDFont.clearResources(); - COSName.clearResources(); - PDType1Font.clearResources(); - PDTrueTypeFont.clearResources(); - PDType0Font.clearResources(); - PDType1AfmPfbFont.clearResources(); - PDType3Font.clearResources(); - PDType1CFont.clearResources(); - PDCIDFont.clearResources(); - PDCIDFontType0Font.clearResources(); - PDCIDFontType2Font.clearResources(); - PDMMType1Font.clearResources(); - PDSimpleFont.clearResources(); + ResourceCleaner cl = new ResourceCleaner(); + cl.clearClassResources("org.apache.pdfbox.cos.COSName"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont"); + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + private static class ResourceCleaner { + Method findLoadedClass; + private ClassLoader sys; + public ResourceCleaner() { + try { + this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class }); + this.findLoadedClass.setAccessible(true); + this.sys = ClassLoader.getSystemClassLoader(); + } catch (Throwable e) { + e.printStackTrace(); + this.findLoadedClass = null; + this.sys = 
null; + } + } + public void clearClassResources(String name) { + if (this.findLoadedClass == null) return; + try { + Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name); + if (pdfparserpainclass != null) { + Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {}); + if (clearResources != null) clearResources.invoke(null); + } + } catch (Throwable e) { + e.printStackTrace(); + } + } } /** diff --git a/source/net/yacy/http/CrashProtectionHandler.java b/source/net/yacy/http/CrashProtectionHandler.java index 257780f61..bbb4fb917 100644 --- a/source/net/yacy/http/CrashProtectionHandler.java +++ b/source/net/yacy/http/CrashProtectionHandler.java @@ -37,12 +37,12 @@ public class CrashProtectionHandler extends HandlerWrapper implements Handler, H } private void writeResponse(HttpServletRequest request, HttpServletResponse response, Exception exc) throws IOException { - PrintWriter out = response.getWriter(); - out.println("Ops!"); - out.println(); - out.println("Message: " + exc.getMessage()); - exc.printStackTrace(out); - response.setContentType("text/plain"); - response.setStatus(500); + PrintWriter out = response.getWriter(); + out.println("Ops!"); + out.println(); + out.println("Message: " + exc.getMessage()); + exc.printStackTrace(out); + response.setContentType("text/plain"); + response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); } } diff --git a/source/net/yacy/http/ProxyHandler.java b/source/net/yacy/http/ProxyHandler.java index acef4b3c0..7d7d208df 100644 --- a/source/net/yacy/http/ProxyHandler.java +++ b/source/net/yacy/http/ProxyHandler.java @@ -91,7 +91,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler { HttpServletResponse response) throws IOException, ServletException { RequestHeader proxyHeaders = convertHeaderFromJetty(request); - final String httpVer = (String) request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER); + final String httpVer = 
request.getHeader(HeaderFramework.CONNECTION_PROP_HTTP_VER); setViaHeader (proxyHeaders, httpVer); proxyHeaders.remove(RequestHeader.KEEP_ALIVE); proxyHeaders.remove(RequestHeader.CONTENT_LENGTH); diff --git a/source/net/yacy/http/SSIHandler.java b/source/net/yacy/http/SSIHandler.java index 314b747d0..095861368 100644 --- a/source/net/yacy/http/SSIHandler.java +++ b/source/net/yacy/http/SSIHandler.java @@ -27,7 +27,6 @@ package net.yacy.http; import java.io.IOException; import java.io.OutputStream; -import javax.servlet.RequestDispatcher; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; diff --git a/source/net/yacy/http/TemplateHandler.java b/source/net/yacy/http/TemplateHandler.java index 156489561..d13e0e00d 100644 --- a/source/net/yacy/http/TemplateHandler.java +++ b/source/net/yacy/http/TemplateHandler.java @@ -97,11 +97,6 @@ public class TemplateHandler extends AbstractHandler implements Handler { htDocsPath = Switchboard.getSwitchboard().htDocsPath.getPath(); } - @Override - protected void doStop() throws Exception { - super.doStop(); - } - /** Returns a path to the localized or default file according to the parameter localeSelection * @param path relative from htroot * @param localeSelection language of localized file; locale.language from switchboard is used if localeSelection.equals("") */ diff --git a/source/net/yacy/http/YaCyHttpServer.java b/source/net/yacy/http/YaCyHttpServer.java index 22c881aa7..e869c0572 100644 --- a/source/net/yacy/http/YaCyHttpServer.java +++ b/source/net/yacy/http/YaCyHttpServer.java @@ -17,13 +17,13 @@ import java.net.SocketException; */ public interface YaCyHttpServer { - abstract public void startupServer() throws Exception; - abstract public void stop() throws Exception; - abstract public void setMaxSessionCount(int cnt); - abstract public InetSocketAddress generateSocketAddress(String port) throws SocketException; - abstract public int 
getMaxSessionCount(); - abstract public int getJobCount(); - abstract public boolean withSSL(); - abstract public void reconnect(int milsec); - abstract public String getVersion(); + abstract void startupServer() throws Exception; + abstract void stop() throws Exception; + abstract void setMaxSessionCount(int cnt); + abstract InetSocketAddress generateSocketAddress(String port) throws SocketException; + abstract int getMaxSessionCount(); + abstract int getJobCount(); + abstract boolean withSSL(); + abstract void reconnect(int milsec); + abstract String getVersion(); } diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index 47799fd7c..44fd4a807 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -25,7 +25,6 @@ package net.yacy.peers; import java.util.ArrayList; -import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -164,7 +163,7 @@ public class Transmission { final ReferenceContainer c = (remaining >= container.size()) ? container : trimContainer(container, remaining); // iterate through the entries in the container and check if the reference is in the repository final List notFoundx = new ArrayList(); - Collection testids = new HashSet(); + Set testids = new HashSet(); Iterator i = c.entries(); while (i.hasNext()) { final WordReference e = i.next(); diff --git a/source/net/yacy/search/ResourceObserver.java b/source/net/yacy/search/ResourceObserver.java index 9cc6a58e7..32e8d2396 100644 --- a/source/net/yacy/search/ResourceObserver.java +++ b/source/net/yacy/search/ResourceObserver.java @@ -129,7 +129,7 @@ public class ResourceObserver { if(MemoryControl.properState()) return Space.HIGH; // clear some caches - @all: are there more of these, we could clear here? 
- this.sb.index.clearCache(); + this.sb.index.clearCaches(); SearchEventCache.cleanupEvents(true); this.sb.trail.clear(); Switchboard.urlBlacklist.clearblacklistCache(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index e392cab37..0307b7e01 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch { * @param ids a collection of url hashes * @return a map from the hash id to: if it exists, the name of the database, otherwise null */ - public Map urlExists(final Collection ids) { + public Map urlExists(final Set ids) { Set e = this.index.exists(ids); Map m = new HashMap(); for (String id: ids) { @@ -2031,7 +2031,7 @@ public final class Switchboard extends serverSwitch { // clear caches if necessary if ( !MemoryControl.request(128000000L, false) ) { - this.index.clearCache(); + this.index.clearCaches(); SearchEventCache.cleanupEvents(false); this.trail.clear(); GuiHandler.clear(); @@ -2556,12 +2556,16 @@ public final class Switchboard extends serverSwitch { ) { // get the hyperlinks final Map hl = Document.getHyperlinks(documents); - boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true); - if (loadImages) hl.putAll(Document.getImagelinks(documents)); + for (Map.Entry entry: Document.getImagelinks(documents).entrySet()) { + if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue()); + } + // add all media links also to the crawl stack. 
They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links if (response.profile().directDocByURL()) { - if (!loadImages) hl.putAll(Document.getImagelinks(documents)); + for (Map.Entry entry: Document.getImagelinks(documents).entrySet()) { + if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue()); + } hl.putAll(Document.getApplinks(documents)); hl.putAll(Document.getVideolinks(documents)); hl.putAll(Document.getAudiolinks(documents)); @@ -2905,7 +2909,7 @@ public final class Switchboard extends serverSwitch { // stacking may fail because of double occurrences of that url. Therefore // we must wait here until the url has actually disappeared int t = 100; - Collection ids = new ArrayList(1); ids.add(ASCII.String(urlhash)); + Set ids = new HashSet(1); ids.add(ASCII.String(urlhash)); while (t-- > 0 && this.index.exists(ids).size() > 0) { try {Thread.sleep(100);} catch (final InterruptedException e) {} ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index a4ac06708..d6dfd24c8 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -323,7 +323,6 @@ public final class SwitchboardConstants { *

public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"

*

Name of the setting how many active crawler-threads may maximal be running on the same time

*/ - public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image"; public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 46389247c..cc127ecbe 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -225,10 +225,10 @@ public final class Fulltext { } } - public void clearCache() { + public void clearCaches() { if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.statsDump != null) this.statsDump.clear(); - this.solrInstances.clearCache(); + this.solrInstances.clearCaches(); this.statsDump = null; } @@ -250,7 +250,7 @@ public final class Fulltext { for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear(); } this.commit(false); - this.solrInstances.clearCache(); + this.solrInstances.clearCaches(); } } @@ -260,7 +260,7 @@ public final class Fulltext { if (instance != null) { for (String name: instance.getCoreNames()) new RemoteSolrConnector(instance, name).clear(); } - this.solrInstances.clearCache(); + this.solrInstances.clearCaches(); } } @@ -400,7 +400,7 @@ public final class Fulltext { throw new IOException(e.getMessage(), e); } this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); } public void putEdges(final Collection edges) throws IOException { @@ -412,7 +412,7 @@ public final class Fulltext { throw new IOException(e.getMessage(), e); } this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache(); + if 
(MemoryControl.shortStatus()) clearCaches(); } /** @@ -432,7 +432,7 @@ public final class Fulltext { throw new IOException(e.getMessage(), e); } this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); } /** @@ -617,10 +617,11 @@ public final class Fulltext { * @param ids * @return a set of ids which exist in the database */ - public Set exists(Collection ids) { + public Set exists(Set ids) { HashSet e = new HashSet(); if (ids == null || ids.size() == 0) return e; - Collection idsC = new HashSet(); + if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e; + Set idsC = new HashSet(); idsC.addAll(ids); if (this.urlIndexFile != null) { Iterator idsi = idsC.iterator(); @@ -751,12 +752,12 @@ public final class Fulltext { } // export methods - public Export export(final File f, final String filter, final int format, final boolean dom) { + public Export export(final File f, final String filter, final String query, final int format, final boolean dom) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, format, dom); + this.exportthread = new Export(f, filter, query, format, dom); this.exportthread.start(); return this.exportthread; } @@ -769,14 +770,15 @@ public final class Fulltext { private final File f; private final Pattern pattern; private int count; - private String failure; + private String failure, query; private final int format; private final boolean dom; - private Export(final File f, final String filter, final int format, boolean dom) { + private Export(final File f, final String filter, final String query, final int format, boolean dom) { // format: 0=text, 1=html, 2=rss/xml this.f = f; this.pattern = filter == null ? null : Pattern.compile(filter); + this.query = query == null? 
"*:*" : query; this.count = 0; this.failure = null; this.format = format; @@ -805,7 +807,7 @@ public final class Fulltext { if (this.dom) { - Map> scores = Fulltext.this.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); + Map> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); for (final String host: stats) { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; @@ -814,21 +816,19 @@ public final class Fulltext { this.count++; } } else { - BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, + BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); SolrDocument doc; - ArrayList title; - String url, author, hash; - String[] descriptions; + String url, hash, title, author, description; Integer size; Date date; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); - title = (ArrayList) doc.getFieldValue(CollectionSchema.title.getSolrFieldName()); - author = (String) 
doc.getFieldValue(CollectionSchema.author.getSolrFieldName()); - descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()); + hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); + author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName())); + description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName())); size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; @@ -836,16 +836,14 @@ public final class Fulltext { pw.println(url); } if (this.format == 1) { - if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); } if (this.format == 2) { pw.println(""); - if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); pw.println("" + MultiProtocolURL.escape(url) + ""); if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); - if (descriptions != null && descriptions.length > 0) { - for (String d: descriptions) pw.println("" + CharacterCoding.unicode2xml(d, true) + ""); - } + if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); if (date != null) pw.println("" + HeaderFramework.formatRFC1123(date) + ""); if (size != null) pw.println("" + size.intValue() + ""); pw.println("" + hash + ""); @@ -883,6 +881,13 @@ 
public final class Fulltext { public int count() { return this.count; } + + @SuppressWarnings("unchecked") + private String getStringFrom(final Object o) { + if (o == null) return ""; + if (o instanceof ArrayList) return ((ArrayList) o).get(0); + return (String) o; + } } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 918458837..617d5269c 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -29,7 +29,6 @@ package net.yacy.search.index; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -443,7 +442,7 @@ public class Segment { * @param ids * @return a set of ids which exist in the database */ - public Set exists(final Collection ids) { + public Set exists(final Set ids) { return this.fulltext.exists(ids); } @@ -504,10 +503,10 @@ public class Segment { } } - public void clearCache() { + public void clearCaches() { if (this.urlCitationIndex != null) this.urlCitationIndex.clearCache(); if (this.termIndex != null) this.termIndex.clearCache(); - this.fulltext.clearCache(); + this.fulltext.clearCaches(); } public File getLocation() { diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 745bbb2ac..50861de59 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -242,7 +242,8 @@ public class QueryGoal { // add filter to prevent that results come from failed urls q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND ("); q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *] OR "); - q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif))"); + q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png 
OR gif) OR "); + q.append(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*))"); // parse special requests if (isCatchall()) return q; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index adda277b2..31b3c35e2 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -898,17 +898,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(); hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); - if (hostscore == null) hostscore = new ClusteredScoreMap(); - + ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts"); + int countcheck = 0; for (String host: hostscore.keyList(true)) { // Patch the citation index for links with canonical tags. // This shall fulfill the following requirement: - // If a document A links to B and B contains a 'canonical C', then the citation rank coputation shall consider that A links to C and B does not link to C. + // If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]"; + long patchquerycount = collectionConnector.getCountByQuery(patchquery); BlockingQueue documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 60000L, 50, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName()); SolrDocument doc_B; + int patchquerycountcheck = 0; try { while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // find all documents which link to the canonical doc @@ -926,10 +928,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri CitationReference doc_A_citation = doc_A_ids_iterator.next(); segment.urlCitation().add(doc_C_url.hash(), doc_A_citation); } + patchquerycountcheck++; } } catch (InterruptedException e) { } catch (SpaceExceededException e) { } + if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck); // do the citation rank computation if (hostscore.get(host) <= 0) continue; @@ -939,12 +943,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri while (convergence_attempts++ < 30) { if (crh.convergenceStep()) break; } - ConcurrentLog.info("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps"); + ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps"); // we have now the cr for all documents of a specific host; we store them for 
later use Map crn = crh.normalize(); //crh.log(crn); ranking.putAll(crn); // accumulate this here for usage in document update later + countcheck++; } + if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck); } catch (final IOException e2) { hostscore = new ClusteredScoreMap(); } @@ -952,13 +958,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // process all documents at the webgraph for the outgoing links of this document SolrDocument doc; if (webgraphConnector != null) { - for (String host: hostscore.keyList(true)) { - if (hostscore.get(host) <= 0) continue; - // select all webgraph edges and modify their cr value - BlockingQueue docs = webgraphConnector.concurrentDocumentsByQuery( - WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"", - 0, 10000000, 60000, 50); - try { + try { + for (String host: hostscore.keyList(true)) { + if (hostscore.get(host) <= 0) continue; + // select all webgraph edges and modify their cr value + String query = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\""; + long count = webgraphConnector.getCountByQuery(query); + ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph"); + BlockingQueue docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 60000, 50); + int countcheck = 0; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { boolean changed = false; SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null); @@ -978,21 +986,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri webgraphConnector.add(sid); } catch (SolrException e) { } catch (IOException e) { - } + } + countcheck++; } - } catch (final InterruptedException e) {} + if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", 
"ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + countcheck); + } + } catch (final IOException e2) { + ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2); + } catch (final InterruptedException e3) { + ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3); } } // process all documents in collection - BlockingQueue docs = collectionConnector.concurrentDocumentsByQuery( - (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + - CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", - 0, 10000, 60000, 50); + String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + + CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]"; int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id Set uniqueURLs = new HashSet(); try { + long count = collectionConnector.getCountByQuery(query); + ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); + BlockingQueue docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000, 60000, 50); + int countcheck = 0; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // for each to-be-processed entry work on the process tag Collection proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName()); @@ -1031,8 +1047,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (!hostExtentCache.containsKey(hosthash)) { StringBuilder q = new StringBuilder(); q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND 
").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); - long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); - hostExtentCache.put(hosthash, count); + long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); + hostExtentCache.put(hosthash, hostExtentCount); } if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; @@ -1047,13 +1063,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri proccount++; } catch (final Throwable e1) { } + countcheck++; } + if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_clickdepthchange + " clickdepth changes, " + proccount_referencechange + " reference-count changes, " + proccount_uniquechange + " unique field changes, " + proccount_citationchange + " citation ranking changes."); - } catch (final InterruptedException e) { + } catch (final InterruptedException e2) { + ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2); + } catch (IOException e3) { + ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3); } return proccount; } @@ -1148,8 +1169,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (entry == null || entry.getValue() == null) continue; try { String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName()); - ConcurrentLog.info("CollectionConfiguration.CRHost", "CR for " + url); - ConcurrentLog.info("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString()); + ConcurrentLog.info("CollectionConfiguration", "CR for 
" + url); + ConcurrentLog.info("CollectionConfiguration", ">> " + entry.getValue().toString()); } catch (final IOException e) { ConcurrentLog.logException(e); }