diff --git a/addon/YaCy.app/Contents/Info.plist b/addon/YaCy.app/Contents/Info.plist
index 3d0b80639..3fd554f7c 100644
--- a/addon/YaCy.app/Contents/Info.plist
+++ b/addon/YaCy.app/Contents/Info.plist
@@ -56,14 +56,14 @@
$JAVAROOT/lib/commons-io-2.1.jar
$JAVAROOT/lib/commons-jxpath-1.3.jar
$JAVAROOT/lib/commons-lang-2.6.jar
- $JAVAROOT/lib/commons-logging-1.1.1.jar
+ $JAVAROOT/lib/commons-logging-1.1.3.jar
$JAVAROOT/lib/fontbox-1.8.2.jar
$JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
$JAVAROOT/lib/guava-13.0.1.jar
$JAVAROOT/lib/htmllexer.jar
- $JAVAROOT/lib/httpclient-4.2.5.jar
- $JAVAROOT/lib/httpcore-4.2.4.jar
- $JAVAROOT/lib/httpmime-4.2.5.jar
+ $JAVAROOT/lib/httpclient-4.3.jar
+ $JAVAROOT/lib/httpcore-4.3.jar
+ $JAVAROOT/lib/httpmime-4.3.jar
$JAVAROOT/lib/icu4j-core.jar
$JAVAROOT/lib/iri-0.8.jar
$JAVAROOT/lib/J7Zip-modified.jar
diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index a8e814f93..8382892ed 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -90,6 +90,9 @@ clickdepth_i
## needed (post-)processing steps on this metadata set
process_sxt
+## key from a harvest process (i.e. the crawl profile hash key), needed for near-real-time postprocessing; it shall be deleted as soon as postprocessing has finished.
+harvestkey_s
+
### optional but highly recommended values, part of the index distribution process
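For illustration, a minimal SolrJ sketch of stamping a document with this key, assuming the field name declared above (the class and method names here are hypothetical, not YaCy code):

```java
import org.apache.solr.common.SolrInputDocument;

public class HarvestKeyStamp {
    /** Tag a document with its crawl profile hash so near-real-time
     *  postprocessing can select exactly the documents of one harvest run. */
    public static SolrInputDocument stamp(SolrInputDocument doc, String crawlProfileHash) {
        doc.setField("harvestkey_s", crawlProfileHash);
        return doc;
    }
}
```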
diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema
index 25afe61b3..0ba0cce70 100644
--- a/defaults/solr.webgraph.schema
+++ b/defaults/solr.webgraph.schema
@@ -26,6 +26,9 @@ collection_sxt
## needed (post-)processing steps on this metadata set, used e.g. for clickdepth computation.
#process_sxt
+
+## key from a harvest process (i.e. the crawl profile hash key), needed for near-real-time postprocessing; it shall be deleted as soon as postprocessing has finished.
+harvestkey_s
##
@@ -71,6 +74,10 @@ source_id_s
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
+## copy of the citation rank norm value from the source link
+source_cr_host_norm_i
+
+
## host of the url (source)
#source_host_s
@@ -168,6 +175,10 @@ target_path_folders_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
+## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
+target_cr_host_norm_i
+
+
## host of the url (target)
#target_host_s
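A sketch of the fill rule the two comments above describe, again with SolrJ and hypothetical helper names:

```java
import org.apache.solr.common.SolrInputDocument;

public class CitationRankCopy {
    /** Copy citation-rank norm values onto a webgraph edge document.
     *  Per the schema comment, the target copy is only valid within one host. */
    public static void fill(SolrInputDocument edge,
                            String sourceHost, int sourceCrNorm,
                            String targetHost, int targetCrNorm) {
        edge.setField("source_cr_host_norm_i", sourceCrNorm);
        if (sourceHost.equals(targetHost)) {
            edge.setField("target_cr_host_norm_i", targetCrNorm);
        }
    }
}
```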
diff --git a/defaults/yacy.init b/defaults/yacy.init
index 1174be8df..7339c3f02 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -772,7 +772,7 @@ search.result.show.tags = false
# search navigators: comma-separated list of default values for search navigation.
# can be temporarily different if the search string is given with different navigation values
# assigning no value(s) means that no navigation is shown
-search.navigation=hosts,authors,namespace,topics,filetype,protocol
+search.navigation=location,hosts,authors,namespace,topics,filetype,protocol
# search result verification and snippet fetch caching rules
# each search result can be verified by loading the link from the web
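As a stand-alone sketch of how such a comma-separated navigator list is consumed (the yacysearch.java hunk below does essentially this via sb.getConfig("search_navigation","").split(",")):

```java
public class NavigationConfigDemo {
    public static void main(String[] args) {
        String config = "location,hosts,authors,namespace,topics,filetype,protocol";
        for (String nav : config.split(",")) {
            // each token enables one navigation facet on the result page
            System.out.println("enable navigator: " + nav.trim());
        }
    }
}
```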
diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index 577db4b33..af5087377 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -105,6 +105,7 @@ public class CrawlProfileEditor_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
+ sb.crawler.removePassive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index f4458a560..60c9a8cbb 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -129,6 +129,7 @@ public class Crawler_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
+ sb.crawler.removePassive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);
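The same cleanup recurs in CrawlProfileEditor_p, Crawler_p and IndexCreateQueues_p: a deleted profile handle must vanish from both the active and the passive profile store, otherwise it lingers in the passive store. A self-contained sketch with plain maps standing in for YaCy's two stores:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ProfileCleanupSketch {
    // stand-ins for the active and passive crawl-profile stores
    static final Map<String, Object> active  = new ConcurrentHashMap<>();
    static final Map<String, Object> passive = new ConcurrentHashMap<>();

    /** Mirrors the fix above: purge the handle from BOTH stores. */
    static void deleteProfile(String handle) {
        active.remove(handle);
        passive.remove(handle); // the added removePassive(...) call
    }
}
```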
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java
index 03c79d4a5..2749a00b1 100644
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -316,10 +316,18 @@ public class DictionaryLoader_p {
}
// check status again
+ boolean keepPlacesTagging = false;
for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {
- prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0);
+ int newstatus = dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0;
+ if (newstatus == 1) keepPlacesTagging = true;
+ prop.put(dictionary.nickname + "Status", newstatus);
}
+ // if all location dictionaries are deleted or deactivated, also remove the places vocabulary
+ if (!keepPlacesTagging) {
+ LibraryProvider.autotagging.removePlaces();
+ }
+
return prop; // return rewrite values for templates
}
}
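The ternary above packs three dictionary states into one status value; spelled out as a sketch (helper name hypothetical):

```java
public class DictionaryStatusDemo {
    /** 1 = dictionary file present (active), 2 = only the disabled file present,
     *  0 = neither exists. Any status 1 keeps places tagging alive. */
    static int status(boolean fileExists, boolean disabledFileExists) {
        return fileExists ? 1 : disabledFileExists ? 2 : 0;
    }

    public static void main(String[] args) {
        System.out.println(status(true, false));  // 1: active
        System.out.println(status(false, true));  // 2: deactivated
        System.out.println(status(false, false)); // 0: not installed
    }
}
```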
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java
index b95d284d7..9f3b7ff95 100644
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -635,7 +635,8 @@ public class IndexControlRWIs_p {
"",//userAgent
false,
false,
- 0.0d, 0.0d, 0.0d);
+ 0.0d, 0.0d, 0.0d,
+ new String[0]);
final SearchEvent theSearch = SearchEventCache.getEvent(query, sb.peers, sb.tables, null, false, sb.loader, Integer.MAX_VALUE, Long.MAX_VALUE, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
if (theSearch.rwiProcess != null && theSearch.rwiProcess.isAlive()) try {theSearch.rwiProcess.join();} catch (final InterruptedException e) {}
if (theSearch.local_rwi_available.get() == 0) {
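The QueryParams constructor apparently gains a trailing array of navigator names here (compare the yacysearch.java hunk below, which passes the split search_navigation setting); callers that need no facet navigation opt out with an empty array rather than null, which spares downstream null checks. A trivial sketch of the two call styles:

```java
public class NavigatorArgDemo {
    public static void main(String[] args) {
        String[] none = new String[0];                        // internal search: no facets
        String[] ui = "location,hosts,topics".split(",");     // UI search (example values)
        System.out.println(none.length + " vs " + ui.length); // prints "0 vs 3"
    }
}
```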
diff --git a/htroot/IndexCreateQueues_p.java b/htroot/IndexCreateQueues_p.java
index 75d169d12..40b832bdf 100644
--- a/htroot/IndexCreateQueues_p.java
+++ b/htroot/IndexCreateQueues_p.java
@@ -69,7 +69,10 @@ public class IndexCreateQueues_p {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(name)) continue;
- if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
+ if (compiledPattern.matcher(name).find()) {
+ sb.crawler.removeActive(entry.handle().getBytes());
+ sb.crawler.removePassive(entry.handle().getBytes());
+ }
}
} else {
// iterating through the list of URLs
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 13f40f22b..d1bfcf3fd 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -251,7 +251,8 @@ public final class search {
false,
0.0d,
0.0d,
- 0.0d
+ 0.0d,
+ new String[0]
);
Network.log.info("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
@@ -315,7 +316,8 @@ public final class search {
false,
0.0d,
0.0d,
- 0.0d
+ 0.0d,
+ new String[0]
);
Network.log.info("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()), ""));
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index cd9e38ff3..a78030bc8 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -668,7 +668,8 @@ public class yacysearch {
&& sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false)
&& sb.peers.mySeed().getFlagAcceptRemoteIndex(),
false,
- lat, lon, rad);
+ lat, lon, rad,
+ sb.getConfig("search_navigation","").split(","));
EventTracker.delete(EventTracker.EClass.SEARCH);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
theQuery.id(true),
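One detail of the config wiring above: in Java, splitting an empty string does not yield an empty array, so an unset search_navigation value still produces one blank token unless it is filtered downstream. A quick demonstration:

```java
public class EmptySplitDemo {
    public static void main(String[] args) {
        String[] navs = "".split(",");
        System.out.println(navs.length);        // prints 1, not 0
        System.out.println(navs[0].isEmpty());  // prints true
    }
}
```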
diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java
index 7a55b606d..e1a47b05e 100644
--- a/htroot/yacysearch_location.java
+++ b/htroot/yacysearch_location.java
@@ -26,6 +26,7 @@ import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.geo.GeoLocation;
+import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@@ -93,7 +94,7 @@ public class yacysearch_location {
// get a queue of search results
final String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
final BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
- SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, null);
+ SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, ClientIdentification.yacyInternetCrawlerAgent);
// take the results and compute some locations
RSSMessage message;
diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java
index ac576f339..05bdfb15d 100644
--- a/htroot/yacysearchtrailer.java
+++ b/htroot/yacysearchtrailer.java
@@ -387,7 +387,8 @@ public class yacysearchtrailer {
// category: location search
// show only if there is a location database present and if there had been any search results
- if (LibraryProvider.geoLoc.isEmpty() || theSearch.getResultCount() == 0) {
+ if ((LibraryProvider.geoLoc.isEmpty() || theSearch.getResultCount() == 0) &&
+ (theSearch.locationNavigator == null || theSearch.locationNavigator.isEmpty())) {
prop.put("cat-location", 0);
} else {
prop.put("cat-location", 1);
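By De Morgan, the widened condition above shows the location facet if either the geo database yielded results or the new locationNavigator carries entries; it hides the facet only when both are empty. A plain-boolean restatement (parameter names hypothetical):

```java
public class LocationFacetRule {
    /** Returns 1 (show facet) or 0 (hide), restating the condition in the hunk. */
    static int catLocation(boolean geoLocEmpty, long resultCount,
                           boolean navPresent, boolean navEmpty) {
        boolean geoUsable = !geoLocEmpty && resultCount > 0;
        boolean navUsable = navPresent && !navEmpty;
        return (geoUsable || navUsable) ? 1 : 0;
    }
}
```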
diff --git a/lib/commons-logging-1.1.1.jar b/lib/commons-logging-1.1.1.jar
deleted file mode 100644
index 8758a96b7..000000000
Binary files a/lib/commons-logging-1.1.1.jar and /dev/null differ
diff --git a/lib/httpclient-4.2.5.License b/lib/commons-logging-1.1.3.License
similarity index 100%
rename from lib/httpclient-4.2.5.License
rename to lib/commons-logging-1.1.3.License
diff --git a/lib/commons-logging-1.1.3.jar b/lib/commons-logging-1.1.3.jar
new file mode 100644
index 000000000..ab5125407
Binary files /dev/null and b/lib/commons-logging-1.1.3.jar differ
diff --git a/lib/dependencies.txt b/lib/dependencies.txt
index 053d6d846..25ea9a53d 100644
--- a/lib/dependencies.txt
+++ b/lib/dependencies.txt
@@ -22,8 +22,8 @@ commons-io-2.1.jar
commons-lang-2.6.jar
geronimo-stax-api_1.0_spec-1.0.1.jar
guava-r05.jar
-httpclient-4.2.3.jar
-httpcore-4.2.3.jar
+httpclient-4.3.jar
+httpcore-4.3.jar
jcl-over-slf4j-1.6.1.jar
log4j-over-slf4j-1.6.1.jar
lucene-analyzers-3.6.0.jar
diff --git a/lib/httpclient-4.2.5.jar b/lib/httpclient-4.2.5.jar
deleted file mode 100644
index 5310588ef..000000000
Binary files a/lib/httpclient-4.2.5.jar and /dev/null differ
diff --git a/lib/httpmime-4.2.5.License b/lib/httpclient-4.3.License
similarity index 100%
rename from lib/httpmime-4.2.5.License
rename to lib/httpclient-4.3.License
diff --git a/lib/httpclient-4.3.jar b/lib/httpclient-4.3.jar
new file mode 100644
index 000000000..5c446f04d
Binary files /dev/null and b/lib/httpclient-4.3.jar differ
diff --git a/lib/httpcore-4.2.4.License b/lib/httpcore-4.2.4.License
deleted file mode 100644
index 879d3fdd9..000000000
--- a/lib/httpcore-4.2.4.License
+++ /dev/null
@@ -1,240 +0,0 @@
- [... standard Apache License 2.0 text plus the CC-BY 2.5 notice for the JCIP-derived annotations (240 lines) elided ...]
diff --git a/lib/httpcore-4.2.4.jar b/lib/httpcore-4.2.4.jar
deleted file mode 100644
index 9f45bd91c..000000000
Binary files a/lib/httpcore-4.2.4.jar and /dev/null differ
diff --git a/lib/commons-logging-1.1.1.License b/lib/httpcore-4.3.License
similarity index 88%
rename from lib/commons-logging-1.1.1.License
rename to lib/httpcore-4.3.License
index 75b52484e..d9a10c0d8 100644
--- a/lib/commons-logging-1.1.1.License
+++ b/lib/httpcore-4.3.License
@@ -1,202 +1,176 @@
- [... standard Apache License 2.0 text with appendix (202 lines) elided ...]
+ [... standard Apache License 2.0 text without the appendix (176 lines) elided ...]
diff --git a/lib/httpcore-4.3.jar b/lib/httpcore-4.3.jar
new file mode 100644
index 000000000..e5da4578a
Binary files /dev/null and b/lib/httpcore-4.3.jar differ
diff --git a/lib/httpmime-4.2.5.jar b/lib/httpmime-4.2.5.jar
deleted file mode 100644
index e63b24d70..000000000
Binary files a/lib/httpmime-4.2.5.jar and /dev/null differ
diff --git a/lib/httpmime-4.3.License b/lib/httpmime-4.3.License
new file mode 100644
index 000000000..d9a10c0d8
--- /dev/null
+++ b/lib/httpmime-4.3.License
@@ -0,0 +1,176 @@
+ [... standard Apache License 2.0 text (176 lines) elided ...]
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/lib/httpmime-4.3.jar b/lib/httpmime-4.3.jar
new file mode 100644
index 000000000..55dd8aecf
Binary files /dev/null and b/lib/httpmime-4.3.jar differ
diff --git a/nbproject/project.xml b/nbproject/project.xml
index 9df73aceb..ca5a5d3bb 100644
--- a/nbproject/project.xml
+++ b/nbproject/project.xml
@@ -73,7 +73,7 @@
source
htroot
- lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.2.3.jar;lib/httpcore-4.2.3.jar;lib/httpmime-4.2.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar
+ lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.jar;lib/httpcore-4.3.jar;lib/httpmime-4.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar
1.6
diff --git a/source/net/yacy/cora/document/encoding/UTF8.java b/source/net/yacy/cora/document/encoding/UTF8.java
index 1d6de94a1..06c5a8176 100644
--- a/source/net/yacy/cora/document/encoding/UTF8.java
+++ b/source/net/yacy/cora/document/encoding/UTF8.java
@@ -24,10 +24,10 @@
package net.yacy.cora.document.encoding;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Comparator;
+import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.content.StringBody;
/**
@@ -45,6 +45,7 @@ public class UTF8 implements Comparator {
static {
charset = Charset.forName("UTF-8");
}
+ private final static ContentType contentType = ContentType.TEXT_PLAIN.withCharset(charset);
public static final UTF8 insensitiveUTF8Comparator = new UTF8(true);
public static final UTF8 identityUTF8Comparator = new UTF8(false);
@@ -103,12 +104,7 @@ public class UTF8 implements Comparator {
}
public final static StringBody StringBody(final String s) {
- try {
- return new StringBody(s == null ? "" : s, charset);
- } catch (final UnsupportedEncodingException e) {
- e.printStackTrace();
- return null;
- }
+ return new StringBody(s == null ? "" : s, contentType);
}
/**
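Side note on the change above: in httpmime 4.3 the StringBody(String, Charset) constructor, which declared UnsupportedEncodingException, is replaced by StringBody(String, ContentType), so the try/catch disappears. A minimal standalone sketch of the new call pattern (class and field names here are illustrative, not part of the patch):

    import java.nio.charset.Charset;
    import org.apache.http.entity.ContentType;
    import org.apache.http.entity.mime.content.StringBody;

    public class StringBodyDemo {
        // the ContentType carries the charset, so no checked encoding exception can occur
        private static final ContentType TEXT_UTF8 =
                ContentType.TEXT_PLAIN.withCharset(Charset.forName("UTF-8"));

        public static StringBody body(final String s) {
            return new StringBody(s == null ? "" : s, TEXT_UTF8);
        }
    }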
diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java
index c1bdb78ed..b45a58553 100644
--- a/source/net/yacy/cora/document/id/DigestURL.java
+++ b/source/net/yacy/cora/document/id/DigestURL.java
@@ -48,7 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
public class DigestURL extends MultiProtocolURL implements Serializable {
private static final long serialVersionUID = -1173233022912141885L;
- public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
// class variables
private byte[] hash;
diff --git a/source/net/yacy/cora/federate/solr/Ranking.java b/source/net/yacy/cora/federate/solr/Ranking.java
index dc20138e3..b61280ab0 100644
--- a/source/net/yacy/cora/federate/solr/Ranking.java
+++ b/source/net/yacy/cora/federate/solr/Ranking.java
@@ -24,6 +24,8 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
+import org.openjena.atlas.logging.Log;
+
import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema;
@@ -75,16 +77,22 @@ public class Ranking {
* @param boostDef the definition string
*/
public void updateBoosts(String boostDef) {
- // call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
+ // call e.g. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description_txt^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
if (boostDef == null || boostDef.length() == 0) return;
String[] bf = CommonPattern.COMMA.split(boostDef);
this.fieldBoosts.clear();
for (String boost: bf) {
int p = boost.indexOf('^');
if (p < 0) continue;
- CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p));
- Float factor = Float.parseFloat(boost.substring(p + 1));
- this.fieldBoosts.put(field, factor);
+ String boostkey = boost.substring(0, p);
+ try {
+ CollectionSchema field = CollectionSchema.valueOf(boostkey);
+ Float factor = Float.parseFloat(boost.substring(p + 1));
+ this.fieldBoosts.put(field, factor);
+ } catch (IllegalArgumentException e) {
+ // boostkey is unknown; ignore it but print a warning
+ Log.warn("Ranking", "unknown boost key '" + boostkey + "'");
+ }
}
}
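To make the tolerant parsing above concrete: each comma-separated token of the boost definition is split at '^' into a schema key and a float factor, and unknown keys are now skipped with a warning instead of letting IllegalArgumentException escape updateBoosts. A hedged standalone sketch of the same logic, with a plain string set standing in for YaCy's CollectionSchema enum:

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.Set;

    public class BoostParseSketch {
        // parse e.g. "sku^20.0,title^15.0,bogus^1.0" into key -> factor,
        // ignoring keys that are not part of the known schema
        public static Map<String, Float> parse(String boostDef, Set<String> knownFields) {
            final Map<String, Float> boosts = new LinkedHashMap<String, Float>();
            if (boostDef == null || boostDef.isEmpty()) return boosts;
            for (final String token : boostDef.split(",")) {
                final int p = token.indexOf('^');
                if (p < 0) continue;
                final String key = token.substring(0, p);
                if (!knownFields.contains(key)) continue; // unknown boost key: skip, as the patch does
                boosts.put(key, Float.parseFloat(token.substring(p + 1)));
            }
            return boosts;
        }
    }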
diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
index d976ae515..d1ad4301d 100644
--- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
+++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@@ -33,6 +33,7 @@ import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
@@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
+ /**
+ * Convert a SolrDocument to a SolrInputDocument.
+ * This is useful if a document from the search index shall be modified and indexed again.
+ * This shall be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
+ * which are created automatically during the indexing process.
+ * @param doc the solr document
+ * @return a solr input document
+ */
+ public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
+ SolrInputDocument sid = new SolrInputDocument();
+ for (String name: doc.getFieldNames()) {
+ if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
+ sid.addField(name, doc.getFieldValue(name), 1.0f);
+ }
+ }
+ return sid;
+ }
+
+ public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
+ SolrDocument sd = new SolrDocument();
+ for (SolrInputField field: doc) {
+ if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check each field if enabled in local Solr schema
+ sd.setField(field.getName(), field.getValue());
+ }
+ }
+ return sd;
+ }
+
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
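A hedged usage sketch for the two converters above: pull a document out of the index, strip fields that Solr maintains itself, modify it, and prepare it for re-indexing (the omit field name and the modified field are illustrative, not taken from the patch):

    import java.util.HashSet;
    import java.util.Set;
    import net.yacy.cora.federate.solr.SchemaConfiguration;
    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrInputDocument;

    public class ReindexSketch {
        public static SolrInputDocument forReindexing(SchemaConfiguration config, SolrDocument found) {
            final Set<String> omit = new HashSet<String>();
            omit.add("_version_"); // illustrative: a field Solr computes during indexing
            final SolrInputDocument sid = config.toSolrInputDocument(found, omit);
            sid.setField("clickdepth_i", 2); // illustrative modification before re-indexing
            return sid;
        }
    }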
diff --git a/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java b/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java
index f9fa62cef..9d09b9242 100644
--- a/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java
+++ b/source/net/yacy/cora/federate/solr/instance/RemoteInstance.java
@@ -46,24 +46,17 @@ import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.entity.GzipDecompressingEntity;
-import org.apache.http.client.protocol.ClientContext;
import org.apache.http.impl.auth.BasicScheme;
-import org.apache.http.impl.client.BasicAuthCache;
-import org.apache.http.impl.client.BasicCredentialsProvider;
-import org.apache.http.impl.client.DefaultHttpClient;
-import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
-import org.apache.http.impl.conn.PoolingClientConnectionManager;
-import org.apache.http.params.HttpConnectionParams;
-import org.apache.http.params.HttpParams;
import org.apache.http.protocol.HttpContext;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
+@SuppressWarnings("deprecation") //TODO: switch to 4.3-Stuff
public class RemoteInstance implements SolrInstance {
private String solrurl;
- private final DefaultHttpClient client;
-
+ private final org.apache.http.impl.client.DefaultHttpClient client;
+// 4.3 private final CloseableHttpClient client;
private final String defaultCoreName;
private final HttpSolrServer defaultServer;
private final Collection<String> coreNames;
@@ -133,25 +126,73 @@ public class RemoteInstance implements SolrInstance {
}
}
if (solraccount.length() > 0) {
- PoolingClientConnectionManager cm = new PoolingClientConnectionManager(); // try also: ThreadSafeClientConnManager
+// 4.3:
+// final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
+// cm.setMaxTotal(100);
+//
+// final RequestConfig.Builder reqBuilder = RequestConfig.custom();
+// reqBuilder.setSocketTimeout(timeout);
+// reqBuilder.setConnectTimeout(timeout);
+// reqBuilder.setConnectionRequestTimeout(timeout);
+//
+// final BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
+// credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw));
+//
+// final HttpClientBuilder builder = HttpClientBuilder.create();
+// builder.setConnectionManager(cm);
+// builder.setDefaultRequestConfig(reqBuilder.build());
+// builder.setDefaultCredentialsProvider(credsProvider);
+// builder.disableAutomaticRetries(); // no retries needed; we expect connections to fail; therefore we should not retry
+// // ask for gzip - why not use net.yacy.cora.protocol.http.GzipRequestInterceptor?
+// builder.addInterceptorLast(new HttpRequestInterceptor() {
+// @Override
+// public void process(final HttpRequest request, final HttpContext context) throws IOException {
+// if (!request.containsHeader("Accept-Encoding")) request.addHeader("Accept-Encoding", "gzip");
+// if (!request.containsHeader("Connection")) request.addHeader("Connection", "close"); // prevent CLOSE_WAIT
+// }
+//
+// });
+// // uncompress gzip - why not use net.yacy.cora.protocol.http.GzipResponseInterceptor?
+// builder.addInterceptorLast(new HttpResponseInterceptor() {
+// @Override
+// public void process(final HttpResponse response, final HttpContext context) throws IOException {
+// HttpEntity entity = response.getEntity();
+// if (entity != null) {
+// Header ceheader = entity.getContentEncoding();
+// if (ceheader != null) {
+// HeaderElement[] codecs = ceheader.getElements();
+// for (HeaderElement codec : codecs) {
+// if (codec.getName().equalsIgnoreCase("gzip")) {
+// response.setEntity(new GzipDecompressingEntity(response.getEntity()));
+// return;
+// }
+// }
+// }
+// }
+// }
+// });
+// this.client = builder.build();
+
+// old Stuff START
+ org.apache.http.impl.conn.PoolingClientConnectionManager cm = new org.apache.http.impl.conn.PoolingClientConnectionManager(); // try also: ThreadSafeClientConnManager
cm.setMaxTotal(100);
- this.client = new DefaultHttpClient(cm) {
+ this.client = new org.apache.http.impl.client.DefaultHttpClient(cm) {
@Override
protected HttpContext createHttpContext() {
HttpContext context = super.createHttpContext();
- AuthCache authCache = new BasicAuthCache();
+ AuthCache authCache = new org.apache.http.impl.client.BasicAuthCache();
BasicScheme basicAuth = new BasicScheme();
HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol());
authCache.put(targetHost, basicAuth);
- context.setAttribute(ClientContext.AUTH_CACHE, authCache);
- this.setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false)); // no retries needed; we expect connections to fail; therefore we should not retry
+ context.setAttribute(org.apache.http.client.protocol.ClientContext.AUTH_CACHE, authCache);
+ this.setHttpRequestRetryHandler(new org.apache.http.impl.client.DefaultHttpRequestRetryHandler(0, false)); // no retries needed; we expect connections to fail; therefore we should not retry
return context;
}
};
- HttpParams params = this.client.getParams();
- HttpConnectionParams.setConnectionTimeout(params, timeout);
- HttpConnectionParams.setSoTimeout(params, timeout);
+ org.apache.http.params.HttpParams params = this.client.getParams();
+ org.apache.http.params.HttpConnectionParams.setConnectionTimeout(params, timeout);
+ org.apache.http.params.HttpConnectionParams.setSoTimeout(params, timeout);
this.client.addRequestInterceptor(new HttpRequestInterceptor() {
@Override
public void process(final HttpRequest request, final HttpContext context) throws IOException {
@@ -178,9 +219,10 @@ public class RemoteInstance implements SolrInstance {
}
}
});
- BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
+ org.apache.http.impl.client.BasicCredentialsProvider credsProvider = new org.apache.http.impl.client.BasicCredentialsProvider();
credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw));
this.client.setCredentialsProvider(credsProvider);
+// old Stuff END
} else {
this.client = null;
}
@@ -248,7 +290,14 @@ public class RemoteInstance implements SolrInstance {
@Override
public void close() {
- if (this.client != null) this.client.getConnectionManager().shutdown();
+ if (this.client != null) this.client.getConnectionManager().shutdown();
+// 4.3
+// if (this.client != null)
+// try {
+// this.client.close();
+// } catch (final IOException e) {
+// // TODO Auto-generated catch block
+// }
}
}
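For orientation, the commented "4.3" block above corresponds roughly to the following self-contained sketch of a credentialed, pooled HttpClient 4.3 client (host, account and timeout are placeholders; this is a sketch of the planned migration, not the code YaCy runs yet):

    import org.apache.http.auth.AuthScope;
    import org.apache.http.auth.UsernamePasswordCredentials;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.impl.client.BasicCredentialsProvider;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClientBuilder;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

    public class Solr43ClientSketch {
        public static CloseableHttpClient build(String host, String user, String pw, int timeout) {
            final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
            cm.setMaxTotal(100);
            final RequestConfig reqConf = RequestConfig.custom()
                    .setSocketTimeout(timeout)
                    .setConnectTimeout(timeout)
                    .setConnectionRequestTimeout(timeout)
                    .build();
            final BasicCredentialsProvider creds = new BasicCredentialsProvider();
            creds.setCredentials(new AuthScope(host, AuthScope.ANY_PORT),
                    new UsernamePasswordCredentials(user, pw));
            return HttpClientBuilder.create()
                    .setConnectionManager(cm)
                    .setDefaultRequestConfig(reqConf)
                    .setDefaultCredentialsProvider(creds)
                    .disableAutomaticRetries() // connections are expected to fail; do not retry
                    .build();
        }
    }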
diff --git a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
index 8d1789e5f..35fa9b768 100644
--- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
+++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
@@ -110,7 +110,7 @@ public class AutotaggingLibrary {
}
public void addPlaces(Locations locations) {
- if (locations.isEmpty()) return; // otherwise we get a navigation that does nothing
+ if (locations.isEmpty()) return; // otherwise we get a navigation that does nothing
Tagging voc = new Tagging("Locations", locations);
try {
voc.setObjectspace("http://dbpedia.org/resource/");
@@ -122,6 +122,10 @@ public class AutotaggingLibrary {
}
}
+ public void removePlaces() {
+ this.vocabularies.remove("Locations");
+ }
+
public int size() {
return this.vocabularies.size();
}
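The new removePlaces() is the inverse of addPlaces(Locations): together they let a caller switch the built-in "Locations" vocabulary on and off at runtime. A hedged fragment (the lib and locations variables are placeholders):

    // assuming 'lib' is an AutotaggingLibrary and 'locations' a non-empty Locations object
    lib.addPlaces(locations); // registers the "Locations" vocabulary
    // ... later, when location tagging is no longer wanted:
    lib.removePlaces();       // drops the vocabulary again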
diff --git a/source/net/yacy/cora/protocol/ByteArrayBody.java b/source/net/yacy/cora/protocol/ByteArrayBody.java
index 352cfffed..132b20f41 100644
--- a/source/net/yacy/cora/protocol/ByteArrayBody.java
+++ b/source/net/yacy/cora/protocol/ByteArrayBody.java
@@ -28,6 +28,7 @@ package net.yacy.cora.protocol;
import java.io.IOException;
import java.io.OutputStream;
+import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MIME;
import org.apache.http.entity.mime.content.AbstractContentBody;
@@ -42,7 +43,7 @@ public class ByteArrayBody extends AbstractContentBody {
* @param filename
*/
public ByteArrayBody(final byte[] bytes, final String filename) {
- super("application/octet-stream");
+ super(ContentType.APPLICATION_OCTET_STREAM);
this.bytes = bytes;
this.filename = filename;
}
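The same 4.3 pattern as in UTF8.java applies here: AbstractContentBody now takes a ContentType rather than a bare MIME string. A minimal hedged sketch of a binary body in that style (class and file name are illustrative, not the YaCy class):

    import java.io.IOException;
    import java.io.OutputStream;
    import org.apache.http.entity.ContentType;
    import org.apache.http.entity.mime.MIME;
    import org.apache.http.entity.mime.content.AbstractContentBody;

    public class BytesBody extends AbstractContentBody {
        private final byte[] bytes;

        public BytesBody(final byte[] bytes) {
            super(ContentType.APPLICATION_OCTET_STREAM); // 4.3: ContentType instead of a MIME string
            this.bytes = bytes;
        }

        @Override public String getFilename() { return "data.bin"; } // illustrative name
        @Override public String getTransferEncoding() { return MIME.ENC_BINARY; }
        @Override public long getContentLength() { return this.bytes.length; }
        @Override public void writeTo(final OutputStream out) throws IOException { out.write(this.bytes); }
    }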
diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java
index c1c6b6d18..70a960f5b 100644
--- a/source/net/yacy/cora/protocol/Domains.java
+++ b/source/net/yacy/cora/protocol/Domains.java
@@ -74,8 +74,8 @@ public class Domains {
private static final String PRESENT = "";
private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)");
- private static final int MAX_NAME_CACHE_HIT_SIZE = 100000;
- private static final int MAX_NAME_CACHE_MISS_SIZE = 100000;
+ private static final int MAX_NAME_CACHE_HIT_SIZE = 10000;
+ private static final int MAX_NAME_CACHE_MISS_SIZE = 1000;
private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2;
// a dns cache
@@ -782,7 +782,7 @@ public class Domains {
public InetAddress call() throws Exception {
return InetAddress.getByName(host);
}
- }, 1000L, TimeUnit.MILLISECONDS, false);
+ }, 3000L, TimeUnit.MILLISECONDS, false);
//ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
}
//.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms");
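The raised limit (1s to 3s) bounds the otherwise blocking InetAddress.getByName call. YaCy wraps this in its own TimeoutRequest helper; a hedged standalone sketch of the same bounded-lookup idea with a plain ExecutorService:

    import java.net.InetAddress;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    public class BoundedDnsLookup {
        private static final ExecutorService pool = Executors.newCachedThreadPool();

        // resolve a host name, but give up after timeoutMillis instead of blocking indefinitely
        public static InetAddress resolve(final String host, final long timeoutMillis) {
            final Future<InetAddress> f = pool.submit(new Callable<InetAddress>() {
                @Override
                public InetAddress call() throws Exception {
                    return InetAddress.getByName(host);
                }
            });
            try {
                return f.get(timeoutMillis, TimeUnit.MILLISECONDS);
            } catch (final Exception e) { // timeout, interruption or lookup failure
                f.cancel(true);
                return null; // treat as unresolved
            }
        }
    }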
diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java
index 71b2591a4..d34dcb7be 100644
--- a/source/net/yacy/cora/protocol/http/HTTPClient.java
+++ b/source/net/yacy/cora/protocol/http/HTTPClient.java
@@ -28,12 +28,12 @@ package net.yacy.cora.protocol.http;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
-import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@@ -49,7 +49,6 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
-import net.yacy.cora.protocol.http.ProxySettings.Protocol;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
@@ -58,41 +57,32 @@ import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
-import org.apache.http.HttpVersion;
-import org.apache.http.auth.AuthScope;
-import org.apache.http.auth.UsernamePasswordCredentials;
-import org.apache.http.client.CredentialsProvider;
-import org.apache.http.client.HttpClient;
+import org.apache.http.client.config.CookieSpecs;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
-import org.apache.http.client.params.CookiePolicy;
-import org.apache.http.client.params.HttpClientParams;
-import org.apache.http.client.protocol.ClientContext;
-import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.config.Registry;
+import org.apache.http.config.RegistryBuilder;
+import org.apache.http.config.SocketConfig;
import org.apache.http.conn.ConnectionKeepAliveStrategy;
-import org.apache.http.conn.params.ConnRouteParams;
+import org.apache.http.conn.DnsResolver;
+import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.conn.routing.HttpRoute;
-import org.apache.http.conn.scheme.PlainSocketFactory;
-import org.apache.http.conn.scheme.Scheme;
-import org.apache.http.conn.scheme.SchemeRegistry;
-import org.apache.http.conn.ssl.SSLSocketFactory;
+import org.apache.http.conn.socket.ConnectionSocketFactory;
+import org.apache.http.conn.socket.PlainConnectionSocketFactory;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.InputStreamEntity;
-import org.apache.http.entity.mime.MultipartEntity;
+import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.ContentBody;
-import org.apache.http.entity.mime.content.StringBody;
-import org.apache.http.impl.client.BasicCredentialsProvider;
-import org.apache.http.impl.client.DefaultHttpClient;
-import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
-import org.apache.http.impl.conn.PoolingClientConnectionManager;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicHeaderElementIterator;
-import org.apache.http.params.BasicHttpParams;
-import org.apache.http.params.HttpConnectionParams;
-import org.apache.http.params.HttpParams;
-import org.apache.http.params.HttpProtocolParams;
-import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.ByteArrayBuffer;
@@ -108,106 +98,126 @@ import org.apache.http.util.EntityUtils;
public class HTTPClient {
private final static int maxcon = 200;
- private static IdledConnectionEvictor idledConnectionEvictor = null;
- private static HttpClient httpClient = initConnectionManager();
- private static final CredentialsProvider credsProvider = new BasicCredentialsProvider();
+ private static IdleConnectionMonitorThread connectionMonitor = null;
+ private final static RequestConfig dfltReqConf = initRequestConfig();
+ private final static HttpClientBuilder clientBuilder = initClientBuilder();
+ private final RequestConfig.Builder reqConfBuilder;
private Set<Entry<String, String>> headers = null;
- private HttpResponse httpResponse = null;
+ private CloseableHttpResponse httpResponse = null;
private HttpUriRequest currentRequest = null;
private long upbytes = 0L;
- private int timeout = 10000;
- private ClientIdentification.Agent agent = null;
private String host = null;
- private boolean redirecting = true;
private String realm = null;
public HTTPClient(final ClientIdentification.Agent agent) {
super();
- this.agent = agent;
- this.timeout = agent.clientTimeout;
- HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent);
+ clientBuilder.setUserAgent(agent.userAgent);
+ reqConfBuilder = RequestConfig.copy(dfltReqConf);
+ reqConfBuilder.setSocketTimeout(agent.clientTimeout);
+ reqConfBuilder.setConnectTimeout(agent.clientTimeout);
+ reqConfBuilder.setConnectionRequestTimeout(agent.clientTimeout);
}
public HTTPClient(final ClientIdentification.Agent agent, final int timeout) {
super();
- this.agent = agent;
- this.timeout = timeout;
- HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent);
+ clientBuilder.setUserAgent(agent.userAgent);
+ reqConfBuilder = RequestConfig.copy(dfltReqConf);
+ reqConfBuilder.setSocketTimeout(timeout);
+ reqConfBuilder.setConnectTimeout(timeout);
+ reqConfBuilder.setConnectionRequestTimeout(timeout);
}
public static void setDefaultUserAgent(final String defaultAgent) {
- HttpProtocolParams.setUserAgent(httpClient.getParams(), defaultAgent);
+ clientBuilder.setUserAgent(defaultAgent);
}
-
- public static HttpClient initConnectionManager() {
- // Create and initialize scheme registry
- final SchemeRegistry schemeRegistry = new SchemeRegistry();
- schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
- schemeRegistry.register(new Scheme("https", 443, getSSLSocketFactory()));
-
- final PoolingClientConnectionManager clientConnectionManager = new PoolingClientConnectionManager(schemeRegistry);
-
- // Create and initialize HTTP parameters
- final HttpParams httpParams = new BasicHttpParams();
- /**
- * ConnectionManager settings
- */
- // how much connections do we need? - default: 20
- clientConnectionManager.setMaxTotal(maxcon);
- // for statistics same value should also be set here
- ConnectionInfo.setMaxcount(maxcon);
- // connections per host (2 default)
- clientConnectionManager.setDefaultMaxPerRoute(2);
- // Increase max connections for localhost
- final HttpHost localhost = new HttpHost(Domains.LOCALHOST);
- clientConnectionManager.setMaxPerRoute(new HttpRoute(localhost), maxcon);
- /**
- * HTTP protocol settings
- */
- HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1);
- // UserAgent
- HttpProtocolParams.setUserAgent(httpParams, ClientIdentification.yacyInternetCrawlerAgent.userAgent);
- HttpProtocolParams.setUseExpectContinue(httpParams, false); // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
- /**
- * HTTP connection settings
- */
+
+ private static RequestConfig initRequestConfig() {
+ final RequestConfig.Builder builder = RequestConfig.custom();
+ // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
+ builder.setExpectContinueEnabled(false);
// timeout in milliseconds until a connection is established
- HttpConnectionParams.setConnectionTimeout(httpParams, 6000);
- // SO_LINGER affects the socket close operation in seconds
- // HttpConnectionParams.setLinger(httpParams, 6);
- // HttpConnectionParams.setSocketBufferSize(httpParams, 8192);
+ builder.setConnectionRequestTimeout(6000);
+ builder.setConnectTimeout(8000);
// SO_TIMEOUT: maximum period of inactivity between two consecutive data packets, in milliseconds
- HttpConnectionParams.setSoTimeout(httpParams, 1000);
+ builder.setSocketTimeout(3000);
// getting an I/O error when executing a request over a connection that has been closed at the server side
- HttpConnectionParams.setStaleCheckingEnabled(httpParams, true);
- // conserve bandwidth by minimizing the number of segments that are sent
- HttpConnectionParams.setTcpNoDelay(httpParams, false);
- // Defines whether the socket can be bound even though a previous connection is still in a timeout state.
- HttpConnectionParams.setSoReuseaddr(httpParams, true);
-
- /**
- * HTTP client settings
- */
+ builder.setStaleConnectionCheckEnabled(true);
// ignore cookies, because this may cause segfaults in the default cookiestore and is not needed
- HttpClientParams.setCookiePolicy(httpParams, CookiePolicy.IGNORE_COOKIES);
-
- httpClient = new DefaultHttpClient(clientConnectionManager, httpParams);
+ builder.setCookieSpec(CookieSpecs.IGNORE_COOKIES);
+ builder.setRedirectsEnabled(true);
+ builder.setRelativeRedirectsAllowed(true);
+ return builder.build();
+ }
+
+ private static HttpClientBuilder initClientBuilder() {
+ final HttpClientBuilder builder = HttpClientBuilder.create();
+
+ builder.setConnectionManager(initPoolingConnectionManager());
+ builder.setDefaultRequestConfig(dfltReqConf);
+
+ // UserAgent
+ builder.setUserAgent(ClientIdentification.yacyInternetCrawlerAgent.userAgent);
+
+ // remove retries; we expect connections to fail; therefore we should not retry
+ builder.disableAutomaticRetries();
// disable the cookiestore, because this may cause segfaults and is not needed
- ((DefaultHttpClient) httpClient).setCookieStore(null);
+ builder.setDefaultCookieStore(null);
+ builder.disableCookieManagement();
+
// add custom keep-alive strategy
- addCustomKeepAliveStrategy((DefaultHttpClient) httpClient);
+ builder.setKeepAliveStrategy(customKeepAliveStrategy());
+
// ask for gzip
- ((DefaultHttpClient) httpClient).addRequestInterceptor(new GzipRequestInterceptor());
+ builder.addInterceptorLast(new GzipRequestInterceptor());
// uncompress gzip
- ((DefaultHttpClient) httpClient).addResponseInterceptor(new GzipResponseInterceptor());
- // remove retries; we expect connections to fail; therefore we should not retry
- ((DefaultHttpClient) httpClient).setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false));
- if (idledConnectionEvictor == null) {
- idledConnectionEvictor = new IdledConnectionEvictor(clientConnectionManager);
- idledConnectionEvictor.start();
+ builder.addInterceptorLast(new GzipResponseInterceptor());
+ // Proxy
+ builder.setRoutePlanner(ProxySettings.RoutePlanner);
+ builder.setDefaultCredentialsProvider(ProxySettings.CredsProvider);
+
+ return builder;
+ }
+
+ private static PoolingHttpClientConnectionManager initPoolingConnectionManager() {
+ final PlainConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
+ final Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
+ .register("http", plainsf)
+ .register("https", getSSLSocketFactory())
+ .build();
+ final PoolingHttpClientConnectionManager pooling = new PoolingHttpClientConnectionManager(registry, new DnsResolver(){
+ @Override
+ public InetAddress[] resolve(final String host0) throws UnknownHostException {
+ final InetAddress ip = Domains.dnsResolve(host0);
+ if (ip == null) throw new UnknownHostException(host0);
+ return new InetAddress[]{ip};
+ }});
+ // how many connections do we need? - default: 20
+ pooling.setMaxTotal(maxcon);
+ // for statistics same value should also be set here
+ ConnectionInfo.setMaxcount(maxcon);
+ // connections per host (2 default)
+ pooling.setDefaultMaxPerRoute(4);
+ // Increase max connections for localhost
+ final HttpHost localhost = new HttpHost(Domains.LOCALHOST);
+ pooling.setMaxPerRoute(new HttpRoute(localhost), maxcon);
+
+ final SocketConfig socketConfig = SocketConfig.custom()
+ // Defines whether the socket can be bound even though a previous connection is still in a timeout state.
+ .setSoReuseAddress(true)
+ // SO_TIMEOUT: maximum period of inactivity between two consecutive data packets, in milliseconds
+ .setSoTimeout(3000)
+ // conserve bandwidth by minimizing the number of segments that are sent
+ .setTcpNoDelay(false)
+ .build();
+ pooling.setDefaultSocketConfig(socketConfig);
+
+ if (connectionMonitor == null) {
+ connectionMonitor = new IdleConnectionMonitorThread(pooling);
+ connectionMonitor.start();
}
- return httpClient;
+
+ return pooling;
}
/**
@@ -217,34 +227,29 @@ public class HTTPClient {
* @throws InterruptedException
*/
public static void closeConnectionManager() throws InterruptedException {
- if (idledConnectionEvictor != null) {
+ if (connectionMonitor != null) {
// Shut down the evictor thread
- idledConnectionEvictor.shutdown();
- idledConnectionEvictor.join();
+ connectionMonitor.shutdown();
+ connectionMonitor.join();
}
- if (httpClient != null) {
- // Shut down the connection manager
- httpClient.getConnectionManager().shutdown();
- }
-
}
- public static void setAuth(final String host, final int port, final String user, final String pw) {
- final UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user, pw);
- final AuthScope scope = new AuthScope(host, port);
- credsProvider.setCredentials(scope, creds);
- httpClient.getParams().setParameter(ClientContext.CREDS_PROVIDER, credsProvider);
- }
+// public static void setAuth(final String host, final int port, final String user, final String pw) {
+// final UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user, pw);
+// final AuthScope scope = new AuthScope(host, port);
+// credsProvider.setCredentials(scope, creds);
+// httpClient.getParams().setParameter(ClientContext.CREDS_PROVIDER, credsProvider);
+// }
- /**
- * this method sets a host on which more than the default of 2 router per host are allowed
- *
- * @param the host to be raised in 'route per host'
- */
- public static void setMaxRouteHost(final String host) {
- final HttpHost mHost = new HttpHost(host);
- ((PoolingClientConnectionManager) httpClient.getConnectionManager()).setMaxPerRoute(new HttpRoute(mHost), 50);
- }
+// /**
+// * this method sets a host on which more than the default of 2 router per host are allowed
+// *
+// * @param the host to be raised in 'route per host'
+// */
+// public static void setMaxRouteHost(final String host) {
+// final HttpHost mHost = new HttpHost(host);
+// ((PoolingClientConnectionManager) httpClient.getConnectionManager()).setMaxPerRoute(new HttpRoute(mHost), 50);
+// }
/**
* This method sets the Header used for the request
@@ -261,7 +266,9 @@ public class HTTPClient {
* @param timeout in milliseconds
*/
public void setTimout(final int timeout) {
- this.timeout = timeout;
+ reqConfBuilder.setSocketTimeout(timeout);
+ reqConfBuilder.setConnectTimeout(timeout);
+ reqConfBuilder.setConnectionRequestTimeout(timeout);
}
/**
@@ -270,7 +277,7 @@ public class HTTPClient {
* @param userAgent
*/
public void setUserAgent(final ClientIdentification.Agent agent) {
- this.agent = agent;
+ clientBuilder.setUserAgent(agent.userAgent);
}
/**
@@ -288,7 +295,8 @@ public class HTTPClient {
* @param redirecting
*/
public void setRedirecting(final boolean redirecting) {
- this.redirecting = redirecting;
+ reqConfBuilder.setRedirectsEnabled(redirecting);
+ reqConfBuilder.setRelativeRedirectsAllowed(redirecting);
}
/**
@@ -354,7 +362,7 @@ public class HTTPClient {
}
httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
- return getContentBytes(httpGet, url.getHost(), maxBytes);
+ return getContentBytes(httpGet, maxBytes);
}
/**
@@ -378,7 +386,7 @@ public class HTTPClient {
httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
this.currentRequest = httpGet;
- execute(httpGet, url.getHost());
+ execute(httpGet);
}
/**
@@ -393,7 +401,7 @@ public class HTTPClient {
final HttpHead httpHead = new HttpHead(url.toNormalform(true));
httpHead.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
- execute(httpHead, url.getHost());
+ execute(httpHead);
finish();
ConnectionInfo.removeConnection(httpHead.hashCode());
return this.httpResponse;
@@ -422,7 +430,7 @@ public class HTTPClient {
this.upbytes = length;
httpPost.setEntity(inputStreamEntity);
this.currentRequest = httpPost;
- execute(httpPost, host);
+ execute(httpPost);
}
/**
@@ -454,10 +462,11 @@ public class HTTPClient {
setHost(vhost); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
if (vhost == null) setHost(Domains.LOCALHOST);
-
- final MultipartEntity multipartEntity = new MultipartEntity();
- for (final Entry<String, ContentBody> part : post.entrySet())
- multipartEntity.addPart(part.getKey(), part.getValue());
+
+ final MultipartEntityBuilder entityBuilder = MultipartEntityBuilder.create();
+ for (final Entry<String, ContentBody> part : post.entrySet())
+ entityBuilder.addPart(part.getKey(), part.getValue());
+ final HttpEntity multipartEntity = entityBuilder.build();
// statistics
this.upbytes = multipartEntity.getContentLength();
@@ -467,7 +476,7 @@ public class HTTPClient {
httpPost.setEntity(multipartEntity);
}
- return getContentBytes(httpPost, url.getHost(), Integer.MAX_VALUE);
+ return getContentBytes(httpPost, Integer.MAX_VALUE);
}
/**
@@ -491,7 +500,7 @@ public class HTTPClient {
// statistics
this.upbytes = length;
httpPost.setEntity(inputStreamEntity);
- return getContentBytes(httpPost, host, Integer.MAX_VALUE);
+ return getContentBytes(httpPost, Integer.MAX_VALUE);
}
/**
@@ -580,9 +589,9 @@ public class HTTPClient {
}
}
- private byte[] getContentBytes(final HttpUriRequest httpUriRequest, String host, final int maxBytes) throws IOException {
+ private byte[] getContentBytes(final HttpUriRequest httpUriRequest, final int maxBytes) throws IOException {
try {
- execute(httpUriRequest, host);
+ execute(httpUriRequest);
if (this.httpResponse == null) return null;
// get the response body
final HttpEntity httpEntity = this.httpResponse.getEntity();
@@ -602,11 +611,13 @@ public class HTTPClient {
}
}
- private void execute(final HttpUriRequest httpUriRequest, String host) throws IOException {
- final HttpContext httpContext = new BasicHttpContext();
+ private void execute(final HttpUriRequest httpUriRequest) throws IOException {
+ final HttpClientContext context = HttpClientContext.create();
+ context.setRequestConfig(reqConfBuilder.build());
+ if (this.host != null)
+ context.setTargetHost(new HttpHost(this.host));
+
setHeaders(httpUriRequest);
- setParams(httpUriRequest.getParams());
- setProxy(httpUriRequest.getParams(), host);
// statistics
storeConnectionInfo(httpUriRequest);
// execute the method; some asserts confirm that the request can be sent with Content-Length and is therefore not terminated by EOF
@@ -620,14 +631,17 @@ public class HTTPClient {
}
Thread.currentThread().setName("HTTPClient-" + httpUriRequest.getURI().getHost());
+ final long time = System.currentTimeMillis();
try {
- final long time = System.currentTimeMillis();
- this.httpResponse = httpClient.execute(httpUriRequest, httpContext);
+ final CloseableHttpClient client = clientBuilder.build();
+ this.httpResponse = client.execute(httpUriRequest, context);
this.httpResponse.setHeader(HeaderFramework.RESPONSE_TIME_MILLIS, Long.toString(System.currentTimeMillis() - time));
} catch (final IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort();
- throw new IOException("Client can't execute: " + (e.getCause() == null ? e.getMessage() : e.getCause().getMessage()));
+ throw new IOException("Client can't execute: "
+ + (e.getCause() == null ? e.getMessage() : e.getCause().getMessage())
+ + " duration=" + Long.toString(System.currentTimeMillis() - time));
}
}
@@ -669,23 +683,6 @@ public class HTTPClient {
httpUriRequest.setHeader("Authorization", "realm=" + this.realm);
}
- private void setParams(final HttpParams httpParams) {
- HttpClientParams.setRedirecting(httpParams, this.redirecting);
- HttpConnectionParams.setConnectionTimeout(httpParams, this.timeout);
- HttpConnectionParams.setSoTimeout(httpParams, this.timeout);
- if (this.agent != null)
- HttpProtocolParams.setUserAgent(httpParams, this.agent.userAgent);
- if (this.host != null)
- httpParams.setParameter(HTTP.TARGET_HOST, this.host);
- }
-
- private static void setProxy(final HttpParams httpParams, String host) {
- if (ProxySettings.useForHost(host, Protocol.HTTP))
- ConnRouteParams.setDefaultProxy(httpParams, ProxySettings.getProxyHost());
- // TODO find a better way for this
- ProxySettings.setProxyCreds((DefaultHttpClient) httpClient);
- }
-
private void storeConnectionInfo(final HttpUriRequest httpUriRequest) {
final int port = httpUriRequest.getURI().getPort();
final String thost = httpUriRequest.getURI().getHost();
@@ -699,7 +696,7 @@ public class HTTPClient {
this.upbytes));
}
- private static SSLSocketFactory getSSLSocketFactory() {
+ private static SSLConnectionSocketFactory getSSLSocketFactory() {
final TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(final X509Certificate[] chain, final String authType)
@@ -728,7 +725,9 @@ public class HTTPClient {
// e.printStackTrace();
}
- final SSLSocketFactory sslSF = new SSLSocketFactory(sslContext, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
+ final SSLConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(
+ sslContext,
+ SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
return sslSF;
}
@@ -739,8 +738,8 @@ public class HTTPClient {
*
* @param defaultHttpClient
*/
- private static void addCustomKeepAliveStrategy(final DefaultHttpClient defaultHttpClient) {
- defaultHttpClient.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
+ private static ConnectionKeepAliveStrategy customKeepAliveStrategy() {
+ return new ConnectionKeepAliveStrategy() {
@Override
public long getKeepAliveDuration(HttpResponse response, HttpContext context) {
// Honor 'keep-alive' header
@@ -762,7 +761,7 @@ public class HTTPClient {
// Keep alive for 5 seconds only
return 5 * 1000;
}
- });
+ };
}
/**
@@ -773,13 +772,13 @@ public class HTTPClient {
public static void main(final String[] args) {
String url = null;
// prepare Parts
- final Map<String, ContentBody> newparts = new LinkedHashMap<String, ContentBody>();
- try {
- newparts.put("foo", new StringBody("FooBar"));
- newparts.put("bar", new StringBody("BarFoo"));
- } catch (final UnsupportedEncodingException e) {
- System.out.println(e.getStackTrace());
- }
+// final Map<String, ContentBody> newparts = new LinkedHashMap<String, ContentBody>();
+// try {
+// newparts.put("foo", new StringBody("FooBar"));
+// newparts.put("bar", new StringBody("BarFoo"));
+// } catch (final UnsupportedEncodingException e) {
+// System.out.println(e.getStackTrace());
+// }
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setRedirecting(false);
// Get some
@@ -805,7 +804,7 @@ public class HTTPClient {
// for (HeaderElement element: header.getElements())
// System.out.println("Element " + element.getName() + " : " + element.getValue());
}
- System.out.println(client.getHttpResponse().getLocale());
+// System.out.println(client.getHttpResponse().getLocale());
System.out.println(client.getHttpResponse().getProtocolVersion());
System.out.println(client.getHttpResponse().getStatusLine());
// Post some
@@ -822,49 +821,41 @@ public class HTTPClient {
}
}
+ public static class IdleConnectionMonitorThread extends Thread {
+
+ private final HttpClientConnectionManager connMgr;
+ private volatile boolean shutdown;
+
+ public IdleConnectionMonitorThread(HttpClientConnectionManager connMgr) {
+ super();
+ this.connMgr = connMgr;
+ }
- /**
- *
- * @see: http://hc.apache.org/httpcomponents-client-4.0.1/tutorial/html/connmgmt.html#d4e638
- *
- */
- private static class IdledConnectionEvictor extends Thread {
-
- private final ClientConnectionManager clientConnectionManager;
-
- private volatile boolean shutdown;
-
- public IdledConnectionEvictor(final ClientConnectionManager clientConnectionManager) {
- super();
- this.clientConnectionManager = clientConnectionManager;
- }
-
- @Override
- public void run() {
- try {
- while (!this.shutdown) {
- synchronized (this) {
- wait(5000);
- // Close expired connections
- this.clientConnectionManager.closeExpiredConnections();
- // Optionally, close connections
- // that have been idle longer than 5 sec
- // (some SOHO router act strange on >5sec idled connections)
- this.clientConnectionManager.closeIdleConnections(5, TimeUnit.SECONDS);
- }
- }
- } catch (final InterruptedException ex) {
- // terminate
- }
- }
-
- public void shutdown() {
- this.shutdown = true;
- synchronized (this) {
- notifyAll();
- }
- }
-
+ @Override
+ public void run() {
+ try {
+ while (!shutdown) {
+ synchronized (this) {
+ wait(5000);
+ // Close expired connections
+ connMgr.closeExpiredConnections();
+ // Optionally, close connections
+ // that have been idle longer than 30 sec
+ connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
+ }
+ }
+ connMgr.shutdown();
+ } catch (final InterruptedException ex) {
+ // terminate
+ }
+ }
+
+ public void shutdown() {
+ shutdown = true;
+ synchronized (this) {
+ notifyAll();
+ }
+ }
}
}
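Summing up the migration in this file: the client is no longer a mutable singleton configured through HttpParams; instead a shared HttpClientBuilder produces a CloseableHttpClient per request, and per-request settings travel in an HttpClientContext. A hedged end-to-end sketch of that 4.3 execution pattern (URL and timeout are placeholders):

    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClientBuilder;
    import org.apache.http.util.EntityUtils;

    public class Get43Sketch {
        public static byte[] fetch(final String url, final int timeout) throws Exception {
            final RequestConfig reqConf = RequestConfig.custom()
                    .setSocketTimeout(timeout)
                    .setConnectTimeout(timeout)
                    .setConnectionRequestTimeout(timeout)
                    .build();
            final HttpClientContext context = HttpClientContext.create();
            context.setRequestConfig(reqConf); // per-request settings, replacing the old HttpParams
            final CloseableHttpClient client = HttpClientBuilder.create().build();
            final CloseableHttpResponse response = client.execute(new HttpGet(url), context);
            try {
                return EntityUtils.toByteArray(response.getEntity());
            } finally {
                response.close();
                client.close();
            }
        }
    }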
diff --git a/source/net/yacy/cora/protocol/http/ProxySettings.java b/source/net/yacy/cora/protocol/http/ProxySettings.java
index 6bb11623d..b5aec00c5 100644
--- a/source/net/yacy/cora/protocol/http/ProxySettings.java
+++ b/source/net/yacy/cora/protocol/http/ProxySettings.java
@@ -27,10 +27,16 @@ package net.yacy.cora.protocol.http;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import org.apache.http.HttpException;
import org.apache.http.HttpHost;
+import org.apache.http.HttpRequest;
import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.Credentials;
import org.apache.http.auth.UsernamePasswordCredentials;
-import org.apache.http.impl.client.AbstractHttpClient;
+import org.apache.http.client.CredentialsProvider;
+import org.apache.http.conn.routing.HttpRoute;
+import org.apache.http.conn.routing.HttpRoutePlanner;
+import org.apache.http.protocol.HttpContext;
/**
* settings for a remote proxy
@@ -71,12 +77,36 @@ public final class ProxySettings {
return new HttpHost(host, port);
}
- public static void setProxyCreds(AbstractHttpClient httpClient) {
- if (!use) return;
- httpClient.getCredentialsProvider().setCredentials(
- new AuthScope(host, port),
- new UsernamePasswordCredentials(user, password));
- }
+ public static HttpRoutePlanner RoutePlanner = new HttpRoutePlanner() {
+
+ @Override
+ public HttpRoute determineRoute(HttpHost target, HttpRequest request, HttpContext context) throws HttpException {
+ if (use) {
+ final Protocol protocol = "https".equalsIgnoreCase(target.getSchemeName()) ? Protocol.HTTPS : Protocol.HTTP;
+ if (useForHost(target.getHostName(), protocol))
+ return new HttpRoute(target, null, getProxyHost(), protocol == Protocol.HTTPS);
+ }
+ return new HttpRoute(target); // direct
+ }
+ };
+
+ public static CredentialsProvider CredsProvider = new CredentialsProvider() {
+
+ @Override
+ public void clear() {
+ }
+
+ @Override
+ public Credentials getCredentials(AuthScope scope) {
+ if (host != null && host.equals(scope.getHost()) && port == scope.getPort())
+ return new UsernamePasswordCredentials(user, password);
+ return null;
+ }
+
+ @Override
+ public void setCredentials(AuthScope arg0, Credentials arg1) {
+ }
+ };
/**
* tell if a remote proxy will be used for the given host
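The two new statics replace the old per-client mutation in setProxyCreds: they are handed once to the HttpClientBuilder (as in HTTPClient.initClientBuilder above), and the planner then decides per request whether to go direct or through the configured proxy. A hedged wiring sketch:

    import net.yacy.cora.protocol.http.ProxySettings;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClientBuilder;

    public class ProxyWiringSketch {
        public static CloseableHttpClient proxiedClient() {
            return HttpClientBuilder.create()
                    .setRoutePlanner(ProxySettings.RoutePlanner)                 // direct vs. proxy per target
                    .setDefaultCredentialsProvider(ProxySettings.CredsProvider) // proxy credentials on demand
                    .build();
        }
    }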
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index 67cbabeb5..7a36de594 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
@@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
@@ -293,37 +290,6 @@ public class Balancer {
return map;
}
- /**
- * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
- * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
- * @param robots
- * @param profileEntry
- * @param crawlURL
- * @return the sleep time in milliseconds; may be negative for no sleep time
- */
- private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
- if (profileEntry == null) return 0;
- long sleeptime = (
- profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
- (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
- ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
- return sleeptime;
- }
-
- /**
- * load a robots.txt to get the robots time.
- * ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
- * This shall therefore not be called in synchronized environments.
- * @param robots
- * @param profileEntry
- * @param crawlURL
- * @return
- */
- private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
- long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
- return sleeptime < 0 ? 0 : sleeptime;
- }
-
/**
* get lists of crawl request entries for a specific host
* @param host
@@ -428,13 +394,13 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
- profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+ profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
- sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
+ sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@@ -445,7 +411,7 @@ public class Balancer {
}
if (crawlEntry == null) return null;
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
- long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
+ long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
@@ -515,7 +481,7 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted in the meantime
Request crawlEntry = new Request(rowEntry);
- CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+ CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
diff --git a/source/net/yacy/crawler/CrawlQueue.java b/source/net/yacy/crawler/CrawlQueue.java
deleted file mode 100644
index ff6a4637e..000000000
--- a/source/net/yacy/crawler/CrawlQueue.java
+++ /dev/null
@@ -1,312 +0,0 @@
-/**
- * CrawlQueue
- * Copyright 2013 by Michael Christen
- * First released 30.08.2013 at http://yacy.net
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program in the file lgpl21.txt
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-package net.yacy.crawler;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-
-import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.order.Base64Order;
-import net.yacy.cora.protocol.ClientIdentification;
-import net.yacy.cora.storage.HandleSet;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.Cache;
-import net.yacy.crawler.data.CrawlProfile;
-import net.yacy.crawler.data.Latency;
-import net.yacy.crawler.retrieval.Request;
-import net.yacy.crawler.robots.RobotsTxt;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.index.BufferedObjectIndex;
-import net.yacy.kelondro.index.Row;
-import net.yacy.kelondro.index.RowHandleSet;
-import net.yacy.kelondro.table.Table;
-import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.repository.Blacklist.BlacklistType;
-import net.yacy.search.Switchboard;
-
-public class CrawlQueue {
-
- private static final int EcoFSBufferSize = 1000;
- private static final int objectIndexBufferSize = 1000;
- private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
-
- private BufferedObjectIndex urlFileIndex;
- private final HandleSet double_push_check;
-
- public CrawlQueue(
- final File cachePath,
- final String filename,
- final boolean useTailCache,
- final boolean exceed134217727) {
-
- // create a stack for newly entered entries
- if (!(cachePath.exists())) cachePath.mkdir(); // make the path
- cachePath.mkdirs();
- final File f = new File(cachePath, filename);
- try {
- this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
- } catch (final SpaceExceededException e) {
- try {
- this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
- } catch (final SpaceExceededException e1) {
- ConcurrentLog.logException(e1);
- }
- }
- this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
- ConcurrentLog.info("CrawlQueue", "opened queue file with " + this.urlFileIndex.size() + " entries from " + f.toString());
- }
-
- public synchronized void close() {
- if (this.urlFileIndex != null) {
- this.urlFileIndex.close();
- this.urlFileIndex = null;
- }
- }
-
- public void clear() {
- ConcurrentLog.info("CrawlQueue", "cleaning CrawlQueue with " + this.urlFileIndex.size() + " entries from " + this.urlFileIndex.filename());
- try {
- this.urlFileIndex.clear();
- } catch (final IOException e) {
- ConcurrentLog.logException(e);
- }
- this.double_push_check.clear();
- }
-
- public Request get(final byte[] urlhash) throws IOException {
- assert urlhash != null;
- if (this.urlFileIndex == null) return null; // case occurs during shutdown
- final Row.Entry entry = this.urlFileIndex.get(urlhash, false);
- if (entry == null) return null;
- return new Request(entry);
- }
-
- public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
- // removes all entries with a specific profile hash.
- // this may last some time
- // returns number of deletions
-
- // first find a list of url hashes that shall be deleted
- final HandleSet urlHashes = new RowHandleSet(this.urlFileIndex.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
- final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
- synchronized (this) {
- final Iterator<Row.Entry> i = this.urlFileIndex.rows();
- Row.Entry rowEntry;
- Request crawlEntry;
- while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
- rowEntry = i.next();
- crawlEntry = new Request(rowEntry);
- if (crawlEntry.profileHandle().equals(profileHandle)) {
- urlHashes.put(crawlEntry.url().hash());
- }
- }
- }
-
- // then delete all these urls from the queues and the file index
- return remove(urlHashes);
- }
-
- /**
- * this method is only here because so many import/export methods need it
- * and it was implemented in the previous architecture;
- * however, usage is not recommended
- * @param urlHashes a list of hashes that shall be removed
- * @return number of entries that had been removed
- * @throws IOException
- */
- public synchronized int remove(final HandleSet urlHashes) throws IOException {
- final int s = this.urlFileIndex.size();
- int removedCounter = 0;
- for (final byte[] urlhash: urlHashes) {
- final Row.Entry entry = this.urlFileIndex.remove(urlhash);
- if (entry != null) removedCounter++;
-
- // remove from double-check caches
- this.double_push_check.remove(urlhash);
- }
- if (removedCounter == 0) return 0;
- assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
-
- return removedCounter;
- }
-
- public boolean has(final byte[] urlhashb) {
- return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb);
- }
-
- public int size() {
- return this.urlFileIndex.size();
- }
-
- public boolean isEmpty() {
- return this.urlFileIndex.isEmpty();
- }
-
- /**
- * push a crawl request on the balancer stack
- * @param entry
- * @return null if this was successful or a String explaining what went wrong in case of an error
- * @throws IOException
- * @throws SpaceExceededException
- */
- public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
- assert entry != null;
- final byte[] hash = entry.url().hash();
- synchronized (this) {
- // double-check
- if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
- if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
-
- if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
- this.double_push_check.put(hash);
-
- // increase dom counter
- if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
- profile.domInc(entry.url().getHost());
- }
-
- // add to index
- final int s = this.urlFileIndex.size();
- this.urlFileIndex.put(entry.toRow());
- assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
- assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
-
- // add the hash to a queue if the host is unknown to get this fast into the balancer
- // now disabled to prevent a crawl from 'freezing' on a specific domain which hosts a lot of pages; the queues are filled anyway
- //if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
- }
- robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
- return null;
- }
-
- /**
- * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
- * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
- * @param robots
- * @param profileEntry
- * @param crawlURL
- * @return the sleep time in milliseconds; may be negative for no sleep time
- */
- private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
- if (profileEntry == null) return 0;
- long sleeptime = (
- profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
- (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
- ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
- return sleeptime;
- }
-
- /**
- * load a robots.txt to get the robots time.
- * ATTENTION: this method causes a robots.txt to be loaded from the web, which may delay execution considerably.
- * This shall therefore not be called in synchronized environments.
- * @param robots
- * @param profileEntry
- * @param crawlURL
- * @return
- */
- private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
- long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
- return sleeptime < 0 ? 0 : sleeptime;
- }
-
- /**
- * get the next entry in this crawl queue in such a way that the domain access time delta is maximized
- * and always above the given minimum delay time. An additional delay time is computed using the robots.txt
- * crawl-delay time, which is always respected. In case the minimum time cannot be ensured, this method pauses
- * the necessary time until the url is released and returned as CrawlEntry object. In case that a profile
- * for the computed Entry does not exist, null is returned
- * @param delay true if the requester demands forced delays using explicit thread sleep
- * @param profile
- * @return a url in a CrawlEntry object
- * @throws IOException
- * @throws SpaceExceededException
- */
- public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
- // returns a crawl entry from the stack and ensures minimum delta times
-
- if (this.urlFileIndex.isEmpty()) return null;
- long sleeptime = 0;
- Request crawlEntry = null;
- CrawlProfile profileEntry = null;
- while (this.urlFileIndex.size() > 0) {
- synchronized (this) {
- Row.Entry rowEntry = this.urlFileIndex.removeOne();
- if (rowEntry == null) return null;
- crawlEntry = new Request(rowEntry);
- profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
- if (profileEntry == null) {
- ConcurrentLog.warn("CrawlQueue", "no profile entry for handle " + crawlEntry.profileHandle());
- return null;
- }
-
- // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
- if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
- ConcurrentLog.fine("CrawlQueue", "URL '" + crawlEntry.url() + "' is in blacklist.");
- continue;
- }
-
- // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
- // if not: return null. A calling method must handle the null value and try again
- profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
- if (profileEntry == null) {
- ConcurrentLog.warn("CrawlQueue", "no profile entry for handle " + crawlEntry.profileHandle());
- continue;
- }
- }
- }
- // depending on the caching policy we need sleep time to avoid DoS-like situations
- sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
-
- ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
- long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
- Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
- if (delay && sleeptime > 0) {
- // force a busy waiting here
- // in the best case, this should never happen if the balancer works properly
- // this is only a protection against the worst case, where the crawler could
- // behave in a DoS-manner
- ConcurrentLog.info("CrawlQueue", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
- long loops = sleeptime / 1000;
- long rest = sleeptime % 1000;
- if (loops < 3) {
- rest = rest + 1000 * loops;
- loops = 0;
- }
- Thread.currentThread().setName("CrawlQueue waiting for " +crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
- synchronized(this) {
- // must be synchronized here to avoid 'takeover' moves from other threads; they would otherwise idle for the same time, which would not be enough
- if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
- for (int i = 0; i < loops; i++) {
- ConcurrentLog.info("CrawlQueue", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
- try {this.wait(1000); } catch (final InterruptedException e) {}
- }
- }
- Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
- }
- return crawlEntry;
- }
-}
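The removed pop() waits out long crawl delays in one-second slices so that a progress line can be logged every second; the same pattern as a standalone sketch (the lock object is a placeholder):

    // split sleeptime into 1-second waits plus a remainder; short waits stay in one piece
    static void sleepInSlices(final Object lock, final long sleeptime) {
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) { rest += 1000 * loops; loops = 0; }
        synchronized (lock) {
            // hold the monitor so another thread cannot start the same wait concurrently
            if (rest > 0) { try { lock.wait(rest); } catch (final InterruptedException e) {} }
            for (int i = 0; i < loops; i++) {
                // a "seconds remaining" log line can be emitted here, as pop() did
                try { lock.wait(1000); } catch (final InterruptedException e) {}
            }
        }
    }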
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 5ff4cbe5e..79f87177a 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
-import net.yacy.crawler.data.ResultURLs;
-import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
@@ -149,7 +147,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) {
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
@@ -294,7 +292,8 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ byte[] handle = UTF8.getBytes(entry.profileHandle());
+ final CrawlProfile profile = this.crawler.get(handle);
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
@@ -302,7 +301,9 @@ public final class CrawlStacker {
return error;
}
- error = checkAcceptance(entry.url(), profile, entry.depth());
+ error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
+ if (error != null) return error;
+ error = checkAcceptanceInitially(entry.url(), profile);
if (error != null) return error;
// store information
@@ -366,53 +367,16 @@ public final class CrawlStacker {
return null;
}
- public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) {
+ /**
+ * Test if a url shall be accepted for crawling using attributes that are constant for the whole crawl.
+ * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
+ * @param url
+ * @param profile
+ * @return null if the url is accepted, otherwise an error string describing why the url was rejected
+ */
+ public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
- // check if the protocol is supported
- final String urlProtocol = url.getProtocol();
final String urlstring = url.toString();
- if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
- this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
- return "unsupported protocol";
- }
-
- // check if ip is local ip address
- final String urlRejectReason = urlInAcceptedDomain(url);
- if (urlRejectReason != null) {
- if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
- return "denied_(" + urlRejectReason + ")";
- }
-
- // check blacklist
- if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
- this.log.fine("URL '" + urlstring + "' is in blacklist.");
- return "url in blacklist";
- }
-
- // filter with must-match for URLs
- if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
- if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
- return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
- }
-
- // filter with must-not-match for URLs
- if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
- if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
- return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
- }
-
- // deny cgi
- if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
- if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
- return "individual url (sessionid etc) not wanted";
- }
-
- // deny post properties
- if (url.isPOST() && !profile.crawlingQ()) {
- if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
- return "post url not allowed";
- }
-
// check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
@@ -451,13 +415,72 @@ public final class CrawlStacker {
final AtomicInteger dp = profile.getCount(url.getHost());
if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
- return "crawl stack domain counter exceeded";
+ return "crawl stack domain counter exceeded (test by profile)";
}
+ /*
if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
- return "result stack domain counter exceeded";
+ return "result stack domain counter exceeded (test by domainCount)";
}
+ */
+ }
+
+ return null;
+ }
+
+ /**
+ * Test if a url shall be accepted using attributes that are defined by a crawl start but can be changed during a crawl.
+ * @param url
+ * @param profile
+ * @param depth
+ * @return null if the url is accepted, otherwise an error string describing why the url was rejected
+ */
+ public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
+
+ // check if the protocol is supported
+ final String urlProtocol = url.getProtocol();
+ final String urlstring = url.toString();
+ if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
+ this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
+ return "unsupported protocol";
+ }
+
+ // check if ip is local ip address
+ final String urlRejectReason = urlInAcceptedDomain(url);
+ if (urlRejectReason != null) {
+ if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
+ return "denied_(" + urlRejectReason + ")";
+ }
+
+ // check blacklist
+ if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
+ this.log.fine("URL '" + urlstring + "' is in blacklist.");
+ return "url in blacklist";
+ }
+
+ // filter with must-match for URLs
+ if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
+ if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
+ return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+ }
+
+ // filter with must-not-match for URLs
+ if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
+ if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
+ return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
+ }
+
+ // deny cgi
+ if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
+ if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
+ return "individual url (sessionid etc) not wanted";
+ }
+
+ // deny post properties
+ if (url.isPOST() && !profile.crawlingQ()) {
+ if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
+ return "post url not allowed";
}
// the following filters use a DNS lookup to check if the url matches with IP filter
@@ -498,7 +521,6 @@ public final class CrawlStacker {
return null;
}
-
/**
* Test a url if it can be used for crawling/indexing
* This mainly checks if the url is in the declared domain (local/global)
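With the check split in two, stackCrawl runs the changeable tests (protocol, filters, blacklist) before the crawl-invariant ones (double registration, domain counters), as the hunk above shows. A sketch of the combined call; the wrapper method is illustrative, not part of the patch:

    // returns null if the url passes both phases, otherwise the first error string
    static String checkAcceptance(final CrawlStacker stacker, final DigestURL url,
                                  final CrawlProfile profile, final int depth) {
        String error = stacker.checkAcceptanceChangeable(url, profile, depth);
        if (error != null) return error;
        return stacker.checkAcceptanceInitially(url, profile);
    }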
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index d2ecf0437..b5053ee52 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -80,8 +80,8 @@ public final class CrawlSwitchboard {
DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE);
}
- public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
- public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
+ public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.heap";
+ public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@@ -103,21 +103,23 @@ public final class CrawlSwitchboard {
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
+ private Switchboard switchboard;
- public CrawlSwitchboard(final String networkName, final ConcurrentLog log, final File queuesRoot) {
+ public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
- log.info("Initializing Word Index for the network '" + networkName + "'.");
+ this.switchboard = switchboard;
+ this.log = this.switchboard.log;
+ this.queuesRoot = this.switchboard.queuesRoot;
+ this.log.info("Initializing Word Index for the network '" + networkName + "'.");
if ( networkName == null || networkName.isEmpty() ) {
log.severe("no network name given - shutting down");
System.exit(0);
}
- this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
this.profilesActiveCrawlsCounter = new ConcurrentHashMap<String, RowHandleSet>();
// make crawl profiles database and default profiles
- this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.config("Initializing Crawl Profiles");
@@ -166,6 +168,23 @@ public final class CrawlSwitchboard {
/ 1024);
}
+ /**
+ * Get a profile from the active or passive stack. Should be used to be sure not to miss old, cleaned-up profiles.
+ * A profile that is found on the passive stack is automatically shifted back to the active stack.
+ * @param profileKey
+ * @return the profile if it exists on either stack, null otherwise
+ */
+ public CrawlProfile get(final byte[] profileKey) {
+ CrawlProfile profile = getActive(profileKey);
+ if (profile != null) return profile;
+ profile = getPassive(profileKey);
+ if (profile == null) return null;
+ // revive the profile: move it back from the passive to the active stack
+ this.putActive(profileKey, profile);
+ this.removePassive(profileKey);
+ return profile;
+ }
+
public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) {
return null;
@@ -237,10 +256,12 @@ public final class CrawlSwitchboard {
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
this.profilesActiveCrawlsCache.put(profileKey, profile);
+ this.removePassive(profileKey);
}
public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesPassiveCrawls.put(profileKey, profile);
+ this.removeActive(profileKey);
}
public RowHandleSet getURLHashes(final byte[] profileKey) {
@@ -534,7 +555,7 @@ public final class CrawlSwitchboard {
return hasDoneSomething;
}
- public int cleanFinishesProfiles(CrawlQueues crawlQueues) {
+ public Set<String> getFinishesProfiles(CrawlQueues crawlQueues) {
// clear the counter cache
this.profilesActiveCrawlsCounter.clear();
@@ -547,7 +568,7 @@ public final class CrawlSwitchboard {
deletionCandidate.add(ASCII.String(handle));
}
}
- if (deletionCandidate.size() == 0) return 0;
+ if (deletionCandidate.size() == 0) return new HashSet<String>(0);
// iterate through all the queues and see if one of these handles appear there
// this is a time-consuming process, set a time-out
@@ -564,15 +585,24 @@ public final class CrawlSwitchboard {
if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
deletionCandidate.remove(handle);
- if (deletionCandidate.size() == 0) return 0;
- if (System.currentTimeMillis() > timeout) return 0; // give up; this is too large
+ if (deletionCandidate.size() == 0) return new HashSet<String>(0);
+ if (System.currentTimeMillis() > timeout) return new HashSet<String>(0); // give up; this is too large
}
- if (deletionCandidate.size() == 0) return 0;
+ if (deletionCandidate.size() == 0) return new HashSet<String>(0);
+ }
+ // look into the CrawlQueues.worker as well
+ Request[] requests = switchboard.crawlQueues.activeWorkerEntries();
+ for (Request request: requests) {
+ deletionCandidate.remove(request.profileHandle());
}
} catch (final Throwable e) {
- return 0;
+ ConcurrentLog.logException(e);
+ return new HashSet<String>(0);
}
-
+ return deletionCandidate;
+ }
+
+ public void cleanProfiles(Set<String> deletionCandidate) {
// all entries that are left are candidates for deletion; do that now
for (String h: deletionCandidate) {
byte[] handle = ASCII.getBytes(h);
@@ -582,7 +612,6 @@ public final class CrawlSwitchboard {
this.removeActive(handle);
}
}
- return deletionCandidate.size();
}
public synchronized void close() {
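A short usage sketch of the new accessor (the request variable is hypothetical); callers no longer need to know on which stack a profile lives:

    // looks on the active stack first; a hit on the passive stack revives the profile
    CrawlProfile profile = crawler.get(UTF8.getBytes(request.profileHandle()));
    if (profile == null) {
        // the handle is unknown on both stacks; the request must be discarded
    }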
diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java
new file mode 100644
index 000000000..c78ac989d
--- /dev/null
+++ b/source/net/yacy/crawler/HostQueue.java
@@ -0,0 +1,256 @@
+/**
+ * HostQueue
+ * Copyright 2013 by Michael Christen
+ * First released 24.09.2013 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.crawler;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.cora.util.SpaceExceededException;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.retrieval.Request;
+import net.yacy.crawler.robots.RobotsTxt;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.BufferedObjectIndex;
+import net.yacy.kelondro.index.Row;
+import net.yacy.kelondro.index.RowHandleSet;
+import net.yacy.kelondro.table.Table;
+import net.yacy.kelondro.util.MemoryControl;
+import net.yacy.repository.Blacklist.BlacklistType;
+import net.yacy.search.Switchboard;
+
+public class HostQueue {
+
+ public static final String indexSuffix = ".stack";
+ private static final int EcoFSBufferSize = 1000;
+ private static final int objectIndexBufferSize = 1000;
+ private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
+
+ private final String hostHash;
+ private final File queuesPath;
+ private BufferedObjectIndex requestStack;
+ private HandleSet urlHashDoubleCheck;
+
+ public HostQueue(
+ final File queuesPath,
+ final String hostHash,
+ final boolean useTailCache,
+ final boolean exceed134217727) {
+ this.hostHash = hostHash;
+ this.queuesPath = queuesPath;
+ this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+
+ // create a stack for newly entered entries
+ if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path
+ this.queuesPath.mkdirs();
+ final File f = new File(this.queuesPath, this.hostHash + indexSuffix);
+ try {
+ this.requestStack = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
+ } catch (final SpaceExceededException e) {
+ try {
+ this.requestStack = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
+ } catch (final SpaceExceededException e1) {
+ ConcurrentLog.logException(e1);
+ }
+ }
+ ConcurrentLog.info("Balancer", "opened balancer file with " + this.requestStack.size() + " entries from " + f.toString());
+ }
+
+ public synchronized void close() {
+ int sizeBeforeClose = this.size();
+ if (this.urlHashDoubleCheck != null) {
+ this.urlHashDoubleCheck.clear();
+ this.urlHashDoubleCheck = null;
+ }
+ if (this.requestStack != null) {
+ this.requestStack.close();
+ this.requestStack = null;
+ }
+ if (sizeBeforeClose == 0) {
+ // clean up
+ new File(this.queuesPath, this.hostHash + indexSuffix).delete();
+ }
+ }
+
+ public void clear() {
+ try {
+ this.requestStack.clear();
+ } catch (final IOException e) {
+ ConcurrentLog.logException(e);
+ }
+ this.urlHashDoubleCheck.clear();
+ }
+
+ public Request get(final byte[] urlhash) throws IOException {
+ assert urlhash != null;
+ if (this.requestStack == null) return null; // case occurs during shutdown
+ final Row.Entry entry = this.requestStack.get(urlhash, false);
+ if (entry == null) return null;
+ return new Request(entry);
+ }
+
+ public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
+ // first find a list of url hashes that shall be deleted
+ final HandleSet urlHashes = new RowHandleSet(this.requestStack.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
+ final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
+ synchronized (this) {
+ final Iterator<Row.Entry> i = this.requestStack.rows();
+ Row.Entry rowEntry;
+ Request crawlEntry;
+ while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
+ rowEntry = i.next();
+ crawlEntry = new Request(rowEntry);
+ if (crawlEntry.profileHandle().equals(profileHandle)) {
+ urlHashes.put(crawlEntry.url().hash());
+ }
+ }
+ }
+
+ // then delete all these urls from the queues and the file index
+ return remove(urlHashes);
+ }
+
+ /**
+ * remove urls from the queue
+ * @param urlHashes a list of hashes that shall be removed
+ * @return number of entries that had been removed
+ * @throws IOException
+ */
+ public synchronized int remove(final HandleSet urlHashes) throws IOException {
+ final int s = this.requestStack.size();
+ int removedCounter = 0;
+ for (final byte[] urlhash: urlHashes) {
+ final Row.Entry entry = this.requestStack.remove(urlhash);
+ if (entry != null) removedCounter++;
+
+ // remove from double-check caches
+ this.urlHashDoubleCheck.remove(urlhash);
+ }
+ if (removedCounter == 0) return 0;
+ assert this.requestStack.size() + removedCounter == s : "urlFileIndex.size() = " + this.requestStack.size() + ", s = " + s;
+ return removedCounter;
+ }
+
+ public boolean has(final byte[] urlhashb) {
+ return this.requestStack.has(urlhashb) || this.urlHashDoubleCheck.has(urlhashb);
+ }
+
+ public int size() {
+ return this.requestStack.size();
+ }
+
+ public boolean isEmpty() {
+ return this.requestStack.isEmpty();
+ }
+
+ public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
+ assert entry != null;
+ final byte[] hash = entry.url().hash();
+ synchronized (this) {
+ // double-check
+ if (this.urlHashDoubleCheck.has(hash)) return "double occurrence in double_push_check";
+ if (this.requestStack.has(hash)) return "double occurrence in urlFileIndex";
+
+ if (this.urlHashDoubleCheck.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.urlHashDoubleCheck.clear();
+ this.urlHashDoubleCheck.put(hash);
+
+ // increase dom counter
+ if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
+ profile.domInc(entry.url().getHost());
+ }
+
+ // add to index
+ final int s = this.requestStack.size();
+ this.requestStack.put(entry.toRow());
+ assert s < this.requestStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.requestStack.size();
+ assert this.requestStack.has(hash) : "hash = " + ASCII.String(hash);
+
+ // add the hash to a queue if the host is unknown to get this fast into the balancer
+ // now disabled to prevent a crawl from 'freezing' on a specific domain which hosts a lot of pages; the queues are filled anyway
+ //if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
+ }
+ robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
+ return null;
+ }
+
+ public Request pop() throws IOException {
+ // returns a crawl entry from the stack, skipping entries that are meanwhile blacklisted
+
+ Request crawlEntry = null;
+ while (!this.requestStack.isEmpty()) {
+ synchronized (this) {
+ Row.Entry rowEntry = this.requestStack.removeOne();
+ if (rowEntry == null) return null;
+ crawlEntry = new Request(rowEntry);
+
+ // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
+ if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
+ ConcurrentLog.fine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
+ continue;
+ }
+ break;
+ }
+ }
+ if (crawlEntry == null) return null;
+ return crawlEntry;
+ }
+
+ public Iterator<Request> iterator() throws IOException {
+ return new EntryIterator();
+ }
+
+ private class EntryIterator implements Iterator<Request> {
+
+ private Iterator<Row.Entry> rowIterator;
+
+ public EntryIterator() throws IOException {
+ this.rowIterator = HostQueue.this.requestStack.rows();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
+ }
+
+ @Override
+ public Request next() {
+ final Row.Entry entry = this.rowIterator.next();
+ try {
+ return (entry == null) ? null : new Request(entry);
+ } catch (final IOException e) {
+ ConcurrentLog.logException(e);
+ this.rowIterator = null;
+ return null;
+ }
+ }
+
+ @Override
+ public void remove() {
+ if (this.rowIterator != null) this.rowIterator.remove();
+ }
+
+ }
+
+}
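A hedged usage sketch for the new per-host stack; the directory, the host hash value, and the cache flags are assumptions:

    // one stack file per host hash inside a common queues directory
    HostQueue queue = new HostQueue(new File("DATA/QUEUES"), "abcdef", true, true);
    String rejectReason = queue.push(request, profile, robots); // null means the request was queued
    if (rejectReason == null) {
        Request next = queue.pop(); // the blacklist is re-checked at pop time
    }
    queue.close(); // deletes the stack file again if the queue is empty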
diff --git a/source/net/yacy/crawler/HostQueues.java b/source/net/yacy/crawler/HostQueues.java
new file mode 100644
index 000000000..37085a782
--- /dev/null
+++ b/source/net/yacy/crawler/HostQueues.java
@@ -0,0 +1,169 @@
+/**
+ * HostQueues
+ * Copyright 2013 by Michael Christen
+ * First released 24.09.2013 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.crawler;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.util.SpaceExceededException;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.retrieval.Request;
+import net.yacy.crawler.robots.RobotsTxt;
+import net.yacy.kelondro.data.word.Word;
+import net.yacy.kelondro.index.RowHandleSet;
+
+/**
+ * wrapper around single-host HostQueue stacks; this is a collection of such queues.
+ * All these queues are stored in a common directory for the queue stacks
+ */
+public class HostQueues {
+
+ private final File queuesPath;
+ private final boolean useTailCache;
+ private final boolean exceed134217727;
+ private final Map<String, HostQueue> queues;
+
+ public HostQueues(
+ final File queuesPath,
+ final boolean useTailCache,
+ final boolean exceed134217727) {
+ this.queuesPath = queuesPath;
+ this.useTailCache = useTailCache;
+ this.exceed134217727 = exceed134217727;
+
+ // create a stack for newly entered entries
+ if (!(queuesPath.exists())) queuesPath.mkdir(); // make the path
+ this.queuesPath.mkdirs();
+ this.queues = new HashMap<String, HostQueue>();
+ String[] list = this.queuesPath.list();
+ for (String queuefile: list) {
+ if (queuefile.endsWith(HostQueue.indexSuffix)) {
+ String hosthash = queuefile.substring(0, queuefile.length() - HostQueue.indexSuffix.length());
+ HostQueue queue = new HostQueue(this.queuesPath, hosthash, this.useTailCache, this.exceed134217727);
+ this.queues.put(hosthash, queue);
+ }
+ }
+ }
+
+ public synchronized void close() {
+ for (HostQueue queue: this.queues.values()) queue.close();
+ this.queues.clear();
+ }
+
+ public void clear() {
+ for (HostQueue queue: this.queues.values()) queue.clear();
+ this.queues.clear();
+ }
+
+ public Request get(final byte[] urlhash) throws IOException {
+ String hosthash = ASCII.String(urlhash, 6, 6);
+ HostQueue queue = this.queues.get(hosthash);
+ if (queue == null) return null;
+ return queue.get(urlhash);
+ }
+
+ public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
+ int c = 0;
+ for (HostQueue queue: this.queues.values()) c += queue.removeAllByProfileHandle(profileHandle, timeout);
+ return c;
+ }
+
+ public synchronized int remove(final HandleSet urlHashes) throws IOException {
+ Map<String, HandleSet> removeLists = new HashMap<String, HandleSet>();
+ for (byte[] urlhash: urlHashes) {
+ String hosthash = ASCII.String(urlhash, 6, 6);
+ HandleSet removeList = removeLists.get(hosthash);
+ if (removeList == null) {
+ removeList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 100);
+ removeLists.put(hosthash, removeList);
+ }
+ try {removeList.put(urlhash);} catch (SpaceExceededException e) {}
+ }
+ int c = 0;
+ for (Map.Entry<String, HandleSet> entry: removeLists.entrySet()) {
+ HostQueue queue = this.queues.get(entry.getKey());
+ if (queue != null) c += queue.remove(entry.getValue());
+ }
+ return c;
+ }
+
+ public boolean has(final byte[] urlhashb) {
+ String hosthash = ASCII.String(urlhashb, 6, 6);
+ HostQueue queue = this.queues.get(hosthash);
+ if (queue == null) return false;
+ return queue.has(urlhashb);
+ }
+
+ public int size() {
+ int c = 0;
+ for (HostQueue queue: this.queues.values()) c += queue.size();
+ return c;
+ }
+
+ public boolean isEmpty() {
+ for (HostQueue queue: this.queues.values()) if (!queue.isEmpty()) return false;
+ return true;
+ }
+
+ /**
+ * push a request to one of the host queues. If the queue does not exist, it is created
+ * @param entry
+ * @param profile
+ * @param robots
+ * @return null if everything is ok or a string with an error message if the push is not allowed according to the crawl profile or robots
+ * @throws IOException
+ * @throws SpaceExceededException
+ */
+ public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
+ String hosthash = ASCII.String(entry.url().hash(), 6, 6);
+ HostQueue queue = this.queues.get(hosthash);
+ if (queue == null) {
+ queue = new HostQueue(this.queuesPath, hosthash, this.useTailCache, this.exceed134217727);
+ this.queues.put(hosthash, queue);
+ }
+ return queue.push(entry, profile, robots);
+ }
+
+ /**
+ * collect one request from each host stack, except from the hosts listed in notFromHost
+ * @param notFromHost do not collect from these hosts
+ * @return a list of requests
+ * @throws IOException
+ */
+ public List<Request> pop(Set<String> notFromHost) throws IOException {
+ ArrayList<Request> requests = new ArrayList<Request>();
+ for (Map.Entry<String, HostQueue> entry: this.queues.entrySet()) {
+ if (notFromHost.contains(entry.getKey())) continue;
+ Request r = entry.getValue().pop();
+ if (r != null) requests.add(r);
+ }
+ return requests;
+ }
+
+}
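The host hash is taken from characters 6..11 of the url hash, so all requests for one host land on the same stack, and pop() collects at most one request per host, which yields a naturally domain-balanced crawl round. A sketch (variable names are assumptions):

    // one crawl round: take one request from every host that is not already busy
    Set<String> busyHosts = new HashSet<String>();
    List<Request> round = hostQueues.pop(busyHosts);
    for (final Request r : round) {
        // dispatch r to a loader and mark its host as busy for the next round
    }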
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 03efd21b9..819a84065 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
}
if (name.length() > 256) name = name.substring(0, 256);
this.doms = new ConcurrentHashMap();
- final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength);
+ final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
put(HANDLE, handle);
put(NAME, name);
put(AGENT_NAME, userAgentName);
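The handle is now derived from the crawl parameters as well as the name, so restarting a crawl with the same name but, say, a different depth produces a distinct profile instead of colliding with the old one. A minimal sketch of the derivation; the helper name and its use as a free function are assumptions:

    // hypothetical helper mirroring the handle computation above
    static String handleOf(final String name, final String mustMatch, final int depth,
                           final String mustNotMatch, final int domMaxPages, final String collections) {
        return Base64Order.enhancedCoder
                .encode(Digest.encodeMD5Raw(name + mustMatch + depth + mustNotMatch + domMaxPages + collections))
                .substring(0, Word.commonHashLength);
    }
    // handleOf("wiki", ".*", 2, "", 1000, "user") differs from handleOf("wiki", ".*", 3, "", 1000, "user")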
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index be18de809..7da49f812 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -255,7 +255,7 @@ public class CrawlQueues {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
if (profile == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
@@ -269,14 +269,13 @@ public class CrawlQueues {
if (urlEntry == null) {
continue;
}
- final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
- if (profileHandle == null) {
+ if (urlEntry.profileHandle() == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
- load(urlEntry, stats, profileHandle);
+ load(urlEntry, stats);
return true;
} catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@@ -296,8 +295,8 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
- private void load(final Request urlEntry, final String stats, final String profileHandle) {
- final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
+ private void load(final Request urlEntry, final String stats) {
+ final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(urlEntry.profileHandle()));
if (profile != null) {
// check if the protocol is supported
@@ -574,11 +573,7 @@ public class CrawlQueues {
try {
final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots);
if (urlEntry == null) return false;
- final String profileHandle = urlEntry.profileHandle();
- // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
- // profileHandle = " + profileHandle + ", urlEntry.url = " +
- // urlEntry.url());
- load(urlEntry, stats, profileHandle);
+ load(urlEntry, stats);
return true;
} catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@@ -606,7 +601,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
- this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
+ this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
}
private long age() {
diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java
index 34bd026eb..2e74dabb7 100644
--- a/source/net/yacy/crawler/data/Latency.java
+++ b/source/net/yacy/crawler/data/Latency.java
@@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -262,6 +263,37 @@ public class Latency {
return s.toString();
}
+ /**
+ * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
+ * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
+ * @param robots
+ * @param profileEntry
+ * @param crawlURL
+ * @return the sleep time in milliseconds; may be negative for no sleep time
+ */
+ public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
+ if (profileEntry == null) return 0;
+ long sleeptime = (
+ profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
+ (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
+ ) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause robots.txt to be loaded from the server
+ return sleeptime;
+ }
+
+ /**
+ * load a robots.txt to get the robots time.
+ * ATTENTION: this method causes a robots.txt to be loaded from the web, which may delay execution considerably.
+ * This shall therefore not be called in synchronized environments.
+ * @param robots
+ * @param profileEntry
+ * @param crawlURL
+ * @return
+ */
+ public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
+ long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause robots.txt to be loaded from the server
+ return sleeptime < 0 ? 0 : sleeptime;
+ }
+
public static final class Host {
private AtomicLong timeacc;
private AtomicLong lastacc;
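A sketch of how a caller combines the two helpers now that they are statics on Latency, mirroring the Balancer usage earlier in this patch:

    static void delayBeforeLoad(final RobotsTxt robots, final CrawlProfile profile, final DigestURL url) {
        long sleeptime = Latency.getDomainSleepTime(robots, profile, url); // negative value: no wait required
        long robotsTime = Latency.getRobotsTime(robots, url, profile.getAgent()); // may load robots.txt; keep outside synchronized blocks
        Latency.updateAfterSelection(url, robotsTime);
        if (sleeptime > 0) {
            try { Thread.sleep(sleeptime); } catch (final InterruptedException e) {}
        }
    }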
diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java
index 81bc12e68..aaf3b6c2a 100644
--- a/source/net/yacy/crawler/retrieval/FTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/FTPLoader.java
@@ -101,7 +101,7 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory
@@ -249,7 +249,7 @@ public class FTPLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,
@@ -264,7 +264,7 @@ public class FTPLoader {
final byte[] b = ftpClient.get(path);
// create a response
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,
diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java
index 06d8bde3c..676ab9d4f 100644
--- a/source/net/yacy/crawler/retrieval/FileLoader.java
+++ b/source/net/yacy/crawler/retrieval/FileLoader.java
@@ -83,7 +83,7 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -123,7 +123,7 @@ public class FileLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -140,7 +140,7 @@ public class FileLoader {
is.close();
// create response with loaded content
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index 17b9d8c16..e6b859115 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -589,7 +589,7 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
- final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
+ final Date ifModifiedSince = this.ifModifiedSince();
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = this.responseHeader.lastModified();
diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java
index 56c2adca1..302bf6bca 100644
--- a/source/net/yacy/crawler/retrieval/SMBLoader.java
+++ b/source/net/yacy/crawler/retrieval/SMBLoader.java
@@ -101,7 +101,7 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -141,7 +141,7 @@ public class SMBLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
@@ -158,7 +158,7 @@ public class SMBLoader {
is.close();
// create response with loaded content
- final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java
index d2732b95d..3075158a3 100644
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@@ -575,7 +575,7 @@ public final class Protocol {
maximumRecords,
verify,
global,
- null);
+ ClientIdentification.yacyInternetCrawlerAgent);
}
protected static int primarySearch(
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 7f7d59836..d421d5258 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -187,7 +187,7 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol();
final String host = url.getHost();
- final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
+ final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 70cb1a180..86294f2ae 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -536,7 +536,7 @@ public final class Switchboard extends serverSwitch {
}
// create a crawler
- this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
+ this.crawler = new CrawlSwitchboard(networkName, this);
// start yacy core
this.log.config("Starting YaCy Protocol Core");
@@ -1330,7 +1330,7 @@ public final class Switchboard extends serverSwitch {
// create a crawler
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
- this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
+ this.crawler = new CrawlSwitchboard(networkName, this);
// init a DHT transmission dispatcher
this.dhtDispatcher =
@@ -2009,7 +2009,7 @@ public final class Switchboard extends serverSwitch {
// clear caches
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
Word.clearCache();
- Domains.clear();
+ // Domains.clear();
// clean up image stack
ResultImages.clearQueues();
@@ -2130,9 +2130,24 @@ public final class Switchboard extends serverSwitch {
// clean up profiles
checkInterruption();
- //cleanProfiles();
- int cleanup = this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? 0 : this.crawler.cleanFinishesProfiles(this.crawlQueues);
- if (cleanup > 0) log.info("cleanup removed " + cleanup + " crawl profiles");
+
+ if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
+ Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
+ int cleanup = deletionCandidates.size();
+ if (cleanup > 0) {
+ // run postprocessing on these profiles
+ postprocessingRunning = true;
+ int proccount = 0;
+ for (String profileHash: deletionCandidates) {
+ proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, profileHash);
+ proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, profileHash);
+ }
+ postprocessingRunning = false;
+
+ this.crawler.cleanProfiles(deletionCandidates);
+ log.info("cleanup removed " + cleanup + " crawl profiles, post-processed " + proccount + " documents");
+ }
+ }
// clean up news
checkInterruption();
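
The cleanup job above now runs postprocessing per finished crawl profile before that profile is deleted; the profile hash doubles as the harvestkey_s tag of the documents it produced. A condensed sketch of the control flow, reusing the names from this patch:

    // Sketch of the cleanup flow above: postprocess first, delete after.
    Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
    for (String profileHash : deletionCandidates) {
        // restrict postprocessing to the documents of this harvest
        index.fulltext().getDefaultConfiguration().postprocessing(index, profileHash);
        index.fulltext().getWebgraphConfiguration().postprocessing(index, profileHash);
    }
    this.crawler.cleanProfiles(deletionCandidates); // only now the profiles may go
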
@@ -2268,11 +2283,14 @@ public final class Switchboard extends serverSwitch {
// if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned
if (this.crawlQueues.coreCrawlJobSize() == 0) {
- if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
+ if (this.crawlQueues.noticeURL.isEmpty()) {
+ Domains.clear();
+ this.crawlQueues.noticeURL.clear(); // flushes more caches
+ }
postprocessingRunning = true;
int proccount = 0;
- proccount += index.fulltext().getDefaultConfiguration().postprocessing(index);
- proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index);
+ proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, null);
+ proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, null);
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
@@ -2490,13 +2508,13 @@ public final class Switchboard extends serverSwitch {
if (response.profile() != null) {
ArrayList<Document> newDocs = new ArrayList<Document>();
for (Document doc: documents) {
- String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
+ String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear it's not the start url*/);
if (rejectReason == null) {
newDocs.add(doc);
} else {
// record these as failed urls so that the problem can be tracked
if (rejectReason != null && !rejectReason.startsWith("double in")) {
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
+ final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
}
@@ -2659,18 +2677,28 @@ public final class Switchboard extends serverSwitch {
// the condenser may be null if indexing is not wanted (there may be a no-indexing flag in the file)
if ( in.condenser != null ) {
for ( int i = 0; i < in.documents.length; i++ ) {
+ CrawlProfile profile = in.queueEntry.profile();
storeDocumentIndex(
in.queueEntry,
in.queueEntry.profile().collections(),
in.documents[i],
in.condenser[i],
null,
- "crawler/indexing queue");
+ profile == null ? "crawler" : profile.handle());
}
}
in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
}
+ /**
+ * Store a parsed document in the search index.
+ * @param queueEntry the crawler queue entry that delivered the document
+ * @param collections the collection names (with their match patterns) assigned to the document
+ * @param document the parsed document
+ * @param condenser the condensed word statistics of the document
+ * @param searchEvent an optional search event; may be null
+ * @param sourceName if this document was created by a crawl, then the sourceName contains the crawl hash
+ */
private void storeDocumentIndex(
final Response queueEntry,
final Map<String, Pattern> collections,
@@ -2821,7 +2849,7 @@ public final class Switchboard extends serverSwitch {
public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) {
if (rootURLs == null || rootURLs.size() == 0) return;
- List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+ final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
for (DigestURL url: rootURLs) {
final DigestURL turl = url;
Thread t = new Thread() {
@@ -2832,9 +2860,9 @@ public final class Switchboard extends serverSwitch {
};
t.start();
stackthreads.add(t);
- try {Thread.sleep(10);} catch (final InterruptedException e) {} // to prevent that this fires more than 100 connections pre second!
+ try {Thread.sleep(100);} catch (final InterruptedException e) {} // throttle thread creation so that this fires no more than 10 connections per second
}
- long waitingtime = 1 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
+ final long waitingtime = 10 + (30000 / rootURLs.size()); // wait at most about half a minute in total so that the crawl start does not run into a time-out
for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
}
@@ -2974,8 +3002,8 @@ public final class Switchboard extends serverSwitch {
continue;
}
final Request request = this.loader.request(e.getValue(), true, true);
- final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
- final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
+ final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
+ final String acceptedError = this.crawlStacker.checkAcceptanceChangeable(e.getValue(), profile, 0);
if (acceptedError != null) {
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
continue;
@@ -3004,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse();
if (documents != null) {
for (final Document document: documents) {
- final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
@@ -3047,8 +3075,9 @@ public final class Switchboard extends serverSwitch {
if (existingids.contains(e.getKey())) continue; // double
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
- final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
- final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
+ final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
+ String acceptedError = this.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
+ if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
return;
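
The acceptance test is now split into a changeable part (re-checked on every pass, apparently for filters that may be edited while a profile is live) and an initial part (checked once when a URL first enters the crawler). A sketch of how the two stages combine, as used in addToCrawler above:

    // Sketch: combined acceptance check; returns null if the url is accepted,
    // otherwise the reject reason from the first failing stage.
    String acceptance(CrawlStacker stacker, DigestURL url, CrawlProfile profile) {
        String reason = stacker.checkAcceptanceChangeable(url, profile, 0);
        if (reason == null) reason = stacker.checkAcceptanceInitially(url, profile);
        return reason;
    }
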
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index abf4fc67e..6ebb21dc1 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -599,7 +599,7 @@ public class Segment {
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
- final String sourceName,
+ final String sourceName, // contains the crawl profile hash if this comes from a web crawl
final boolean storeToRWI
) {
final long startTime = System.currentTimeMillis();
@@ -619,7 +619,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
- final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
+ final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration(), sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index f4c540839..d5b99a077 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -79,8 +79,16 @@ public final class QueryParams {
}
}
- private static final CollectionSchema[] defaultfacetfields = new CollectionSchema[]{
- CollectionSchema.host_s, CollectionSchema.url_protocol_s, CollectionSchema.url_file_ext_s, CollectionSchema.author_sxt};
+ private static final Map<String, CollectionSchema> defaultfacetfields = new HashMap<String, CollectionSchema>();
+ static {
+ // the keys must match the values of the configuration property search.navigation
+ defaultfacetfields.put("location", CollectionSchema.coordinate_p);
+ defaultfacetfields.put("hosts", CollectionSchema.host_s);
+ defaultfacetfields.put("protocol", CollectionSchema.url_protocol_s);
+ defaultfacetfields.put("filetype", CollectionSchema.url_file_ext_s);
+ defaultfacetfields.put("authors", CollectionSchema.author_sxt);
+ //missing: namespace
+ }
private static final int defaultmaxfacets = 30;
private static final String ampersand = "&";
@@ -132,7 +140,8 @@ public final class QueryParams {
final Bitfield constraint,
final Segment indexSegment,
final RankingProfile ranking,
- final String userAgent) {
+ final String userAgent,
+ final String[] search_navigation) {
this.queryGoal = new QueryGoal(query_original, query_words);
this.ranking = ranking;
this.modifier = new QueryModifier();
@@ -169,8 +178,9 @@ public final class QueryParams {
this.facetfields = new LinkedHashSet<String>();
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
- for (CollectionSchema f: defaultfacetfields) {
- if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
+ for (String navkey: search_navigation) {
+ CollectionSchema f = defaultfacetfields.get(navkey);
+ if (f != null && solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
}
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
this.maxfacets = defaultmaxfacets;
@@ -205,7 +215,8 @@ public final class QueryParams {
final boolean filterscannerfail,
final double lat,
final double lon,
- final double radius
+ final double radius,
+ final String[] search_navigation
) {
this.queryGoal = queryGoal;
this.modifier = modifier;
@@ -269,8 +280,9 @@ public final class QueryParams {
this.facetfields = new LinkedHashSet<String>();
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
- for (CollectionSchema f: defaultfacetfields) {
- if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
+ for (String navkey: search_navigation) {
+ CollectionSchema f = defaultfacetfields.get(navkey);
+ if (f != null && solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
}
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
this.maxfacets = defaultmaxfacets;
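
With the map above, the active facets follow the search.navigation property instead of a fixed field list. A sketch of what the default setting from yacy.init selects, assuming all fields are enabled in the local Solr schema:

    // search.navigation=location,hosts,authors,namespace,topics,filetype,protocol
    String[] search_navigation = {"location", "hosts", "authors", "namespace", "topics", "filetype", "protocol"};
    // resulting facet fields: coordinate_p, host_s, author_sxt, url_file_ext_s, url_protocol_s;
    // "namespace" and "topics" have no entry in defaultfacetfields and are skipped.
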
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index a34300d82..bc1a5a584 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -136,6 +136,7 @@ public final class SearchEvent {
private Thread localsolrsearch;
private int localsolroffset;
private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counters for references that had been sorted out for other reasons
+ public final ScoreMap<String> locationNavigator; // a counter for the appearance of location coordinates
public final ScoreMap<String> hostNavigator; // a counter for the appearance of host names
public final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
public final ScoreMap<String> namespaceNavigator; // a counter for name spaces
@@ -225,6 +226,7 @@ public final class SearchEvent {
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
// prepare configured search navigation
final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
+ this.locationNavigator = navcfg.contains("location") ? new ConcurrentScoreMap<String>() : null;
this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null;
this.namespaceNavigator = navcfg.contains("namespace") ? new ConcurrentScoreMap<String>() : null;
this.hostNavigator = navcfg.contains("hosts") ? new ConcurrentScoreMap<String>() : null;
@@ -741,6 +743,17 @@ public final class SearchEvent {
// collect navigation information
ReversibleScoreMap<String> fcts;
+ if (this.locationNavigator != null) {
+ fcts = facets.get(CollectionSchema.coordinate_p.getSolrFieldName());
+ if (fcts != null) {
+ for (String coordinate: fcts) {
+ int hc = fcts.get(coordinate);
+ if (hc == 0) continue;
+ this.locationNavigator.inc(coordinate, hc);
+ }
+ }
+ }
+
if (this.hostNavigator != null) {
fcts = facets.get(CollectionSchema.host_s.getSolrFieldName());
if (fcts != null) {
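
The new location navigator is filled like the existing ones: every facet value (a coordinate string) is counted with its occurrence. A minimal sketch of the counting pattern, assuming facets maps Solr field names to their facet counts:

    // Sketch of the navigator counting pattern used above.
    void countLocations(ScoreMap<String> locationNavigator, Map<String, ReversibleScoreMap<String>> facets) {
        ReversibleScoreMap<String> fcts = facets.get("coordinate_p");
        if (fcts == null) return;
        for (String coordinate : fcts) {
            int hc = fcts.get(coordinate);
            if (hc > 0) locationNavigator.inc(coordinate, hc);
        }
    }
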
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index e36eda46a..73e45874a 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@@ -169,52 +169,33 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
- /**
- * Convert a SolrDocument to a SolrInputDocument.
- * This is useful if a document from the search index shall be modified and indexed again.
- * This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
- * which are created automatically during the indexing process.
- * @param doc the solr document
- * @return a solr input document
- */
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
- SolrInputDocument sid = new SolrInputDocument();
- for (String name: doc.getFieldNames()) {
- if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
- sid.addField(name, doc.getFieldValue(name), 1.0f);
- }
- }
- return sid;
+ return toSolrInputDocument(doc, omitFields);
}
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
- SolrDocument sd = new SolrDocument();
- for (SolrInputField field: doc) {
- if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
- sd.setField(field.getName(), field.getValue());
- }
- }
- return sd;
+ return toSolrDocument(doc, omitFields);
}
/**
* add uri attributes to solr document
* @param doc
* @param allAttr
- * @param digestURI
+ * @param digestURL
* @param doctype
* @return the normalized url
*/
- public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) {
- add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
- String us = digestURI.toNormalform(true);
+ public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL, final char doctype) {
+ add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
+ if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
+ String us = digestURL.toNormalform(true);
add(doc, CollectionSchema.sku, us);
if (allAttr || contains(CollectionSchema.ip_s)) {
- final InetAddress address = digestURI.getInetAddress();
+ final InetAddress address = digestURL.getInetAddress();
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
}
String host = null;
- if ((host = digestURI.getHost()) != null) {
+ if ((host = digestURL.getHost()) != null) {
String dnc = Domains.getDNC(host);
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
int p = subdomOrga.lastIndexOf('.');
@@ -228,17 +209,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// path elements of link
- String filename = digestURI.getFileName();
+ String filename = digestURL.getFileName();
String extension = MultiProtocolURL.getFileExtension(filename);
if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
- if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol());
- if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
+ if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
+ if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths());
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype));
- Map<String, String> searchpart = digestURI.getSearchpartMap();
+ Map<String, String> searchpart = digestURL.getSearchpartMap();
if (searchpart == null) {
if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
} else {
@@ -309,7 +290,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate());
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate());
- if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
@@ -357,27 +337,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
public SolrVector yacy2solr(
- final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
+ final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final IndexCell<CitationReference> citations,
- final WebgraphConfiguration webgraph) {
+ final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
- final DigestURL digestURI = document.dc_source();
+ final DigestURL digestURL = document.dc_source();
+ final String id = ASCII.String(digestURL.hash());
boolean allAttr = this.isEmpty();
- String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
+ String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
- add(doc, CollectionSchema.id, id);
- String us = digestURI.toNormalform(true);
+ String us = digestURL.toNormalform(true);
int clickdepth = 999;
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
- if (digestURI.probablyRootURL()) {
- boolean lc = this.lazy; this.lazy = false;
+ if (digestURL.probablyRootURL()) {
clickdepth = 0;
- this.lazy = lc;
} else {
clickdepth = 999;
}
@@ -693,7 +671,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
- final DigestURL canonical = html.getCanonical();
+ DigestURL canonical = html.getCanonical();
+ // if there is no canonical in the html then look into the http header:
+ if (canonical == null) {
+ String link = responseHeader.get("Link", null);
+ int p;
+ if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
+ link = link.substring(0, p).trim();
+ p = link.indexOf('<');
+ int q = link.lastIndexOf('>');
+ if (p >= 0 && q > 0) {
+ link = link.substring(p + 1, q);
+ try {
+ canonical = new DigestURL(link);
+ } catch (MalformedURLException e) {}
+ }
+ }
+ }
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
containsCanonical = true;
inboundLinks.remove(canonical);
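
The fallback above reads the canonical URL from an HTTP Link response header, which carries the form Link: <url>; rel="canonical". A worked example of an input the parser accepts (hypothetical URL):

    // Example input for the header fallback above:
    String link = "<http://example.com/page>; rel=\"canonical\"";
    int p = link.indexOf("rel=\"canonical\""); // > 0 here
    link = link.substring(0, p).trim();        // "<http://example.com/page>;"
    p = link.indexOf('<');
    int q = link.lastIndexOf('>');
    String canonical = link.substring(p + 1, q); // "http://example.com/page"
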
@@ -712,7 +706,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (refresh != null && refresh.length() > 0) {
MultiProtocolURL refreshURL;
try {
- refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath());
+ refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
if (refreshURL != null) {
inboundLinks.remove(refreshURL);
outboundLinks.remove(refreshURL);
@@ -785,7 +779,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
String content = document.getTextString();
- String tokens = digestURI.toTokens();
+ String tokens = digestURL.toTokens();
if (content == null || content.length() == 0) {
content = tokens;
} else {
@@ -798,9 +792,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
- if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) {
+ if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
- content = digestURI.toTokens(); // remove all other entry but the url tokens
+ content = digestURL.toTokens(); // remove all other entry but the url tokens
}
// content (must be written after special parser data, since this can influence the content)
@@ -824,7 +818,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// create a subgraph
if (!containsCanonical) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
- webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
+ webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations, sourceName);
}
// list all links
@@ -850,7 +844,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
- if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
@@ -874,6 +867,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
List<String> p = new ArrayList<String>();
for (ProcessType t: processTypes) p.add(t.name());
add(doc, CollectionSchema.process_sxt, p);
+ if (allAttr || contains(CollectionSchema.harvestkey_s)) {
+ add(doc, CollectionSchema.harvestkey_s, sourceName);
+ }
}
return doc;
}
@@ -885,16 +881,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param urlCitation
* @return
*/
- public int postprocessing(final Segment segment) {
+ public int postprocessing(final Segment segment, final String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
- SolrConnector connector = segment.fulltext().getDefaultConnector();
- connector.commit(true); // make sure that we have latest information that can be found
+ SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
+ SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
+ collectionConnector.commit(true); // make sure that we have latest information that can be found
ReferenceReportCache rrCache = segment.getReferenceReportCache();
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
+ ReversibleScoreMap<String> hostscore = null;
try {
// collect hosts from index which shall take part in citation computation
- ReversibleScoreMap hostscore = connector.getFacets(CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
+ hostscore = collectionConnector.getFacets(
+ (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+ CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
+ 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
+ if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
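
When a harvestkey is given, the facet request above narrows the citation computation to the documents of one harvest. The filter prefix resolves to a plain Solr conjunction; a sketch with a hypothetical key:

    // Sketch: the query built by the getFacets call above.
    String harvestkey = "abcdefghijkl"; // hypothetical crawl profile hash
    String q = (harvestkey == null ? "" : "harvestkey_s:\"" + harvestkey + "\" AND ")
             + "process_sxt:CITATION";
    // -> harvestkey_s:"abcdefghijkl" AND process_sxt:CITATION
    // with harvestkey == null it degrades to: process_sxt:CITATION
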
@@ -912,11 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
ranking.putAll(crn); // accumulate this here for usage in document update later
}
} catch (final IOException e2) {
+ hostscore = new ClusteredScoreMap<String>();
}
- // process all documents
- BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
+ // process the webgraph documents: for each host that took part in the citation computation, update its outgoing edges
SolrDocument doc;
+ if (webgraphConnector != null) {
+ for (String host: hostscore.keyList(true)) {
+ if (hostscore.get(host) <= 0) continue;
+ // select all webgraph edges and modify their cr value
+ BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
+ WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
+ 0, 10000000, 60000, 50);
+ try {
+ while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+ boolean changed = false;
+ SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
+ byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
+ CRV crv = ranking.get(id);
+ if (crv != null) {
+ sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
+ changed = true;
+ }
+ id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
+ crv = ranking.get(id);
+ if (crv != null) {
+ sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
+ changed = true;
+ }
+ if (changed) try {
+ webgraphConnector.add(sid);
+ } catch (SolrException e) {
+ } catch (IOException e) {
+ }
+ }
+ } catch (final InterruptedException e) {}
+ }
+ }
+
+ // process all documents in collection
+ BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
+ (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+ CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
+ 0, 10000, 60000, 50);
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
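
The webgraph pass above copies the per-host citation rank norm (the crn field of a CRV entry) onto every edge on which the ranked document appears as source or, for same-host links, as target. A condensed sketch of the per-edge update, reusing the names from the loop above:

    // Per-edge update as in the loop above: copy the crn value of the
    // ranking entry for the source and the target id onto the edge.
    CRV src = ranking.get(ASCII.getBytes((String) doc.getFieldValue("source_id_s")));
    if (src != null) sid.setField("source_cr_host_norm_i", src.crn);
    CRV tgt = ranking.get(ASCII.getBytes((String) doc.getFieldValue("target_id_s")));
    if (tgt != null) sid.setField("target_cr_host_norm_i", tgt.crn);
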
@@ -964,12 +1003,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;
- // all processing steps checked, remove the processing tag
+ // all processing steps checked, remove the processing tag and the harvest key
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
+ sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
// send back to index
//connector.deleteById(ASCII.String(id));
- connector.add(sid);
+ collectionConnector.add(sid);
+
proccount++;
} catch (final Throwable e1) {
}
@@ -1269,6 +1310,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
configuration.add(doc, CollectionSchema.collection_sxt, cs);
}
+
+ // clickdepth, cr and postprocessing
+ Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
+ if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
+ processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
+ CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
+ }
+ if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
+ processTypes.add(ProcessType.CITATION); // postprocessing needed
+ }
+ if (allAttr || configuration.contains(CollectionSchema.process_sxt)) {
+ List<String> p = new ArrayList<String>();
+ for (ProcessType t: processTypes) p.add(t.name());
+ configuration.add(doc, CollectionSchema.process_sxt, p);
+ }
return doc;
}
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index 9b4009717..ccd75f343 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -59,6 +59,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
+ harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
@@ -231,6 +232,23 @@ public enum CollectionSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
+ // verify our naming scheme
+ String name = this.name();
+ int p = name.indexOf('_');
+ if (p > 0) {
+ String ext = name.substring(p + 1);
+ assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
+ assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
+ assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
+ assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
+ assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
+ assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
+ assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
+ assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
+ assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
+ assert !ext.equals("val") || (type == SolrType.num_integer && multiValued) : name;
+ assert !ext.equals("d") || (type == SolrType.num_double && !multiValued) : name;
+ }
assert type.appropriateName(this) : "bad configuration: " + this.name();
}
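
The assertions above pin down the Solr dynamic-field naming convention: the suffix of a field name must agree with the declared type and multi-valuedness. A few examples from this very schema:

    // Suffix convention checked by the asserts above:
    //   harvestkey_s -> SolrType.string,      single-valued ("s")
    //   process_sxt  -> SolrType.string,      multi-valued  ("sxt")
    //   clickdepth_i -> SolrType.num_integer, single-valued ("i")
    //   load_date_dt -> SolrType.date,        single-valued ("dt")
    // A declaration such as harvestkey_s(SolrType.num_integer, ...) would now
    // fail at class-load time with an AssertionError (when -ea is enabled).
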
diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java
index 0faa2f780..09667711f 100644
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@@ -117,7 +117,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
- final IndexCell<CitationReference> citations) {
+ final IndexCell<CitationReference> citations, final String sourceName) {
boolean allAttr = this.isEmpty();
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
@@ -284,6 +284,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
+ if (allAttr || contains(WebgraphSchema.harvestkey_s)) {
+ add(edge, WebgraphSchema.harvestkey_s, sourceName);
+ }
}
// add the edge to the subgraph
@@ -291,7 +294,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
}
- public int postprocessing(Segment segment) {
+ public int postprocessing(final Segment segment, final String harvestkey) {
if (!this.contains(WebgraphSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
if (!segment.fulltext().writeToWebgraph()) return 0;
@@ -299,7 +302,10 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// that means we must search for those entries.
connector.commit(true); // make sure that we have latest information that can be found
//BlockingQueue docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
- BlockingQueue docs = connector.concurrentDocumentsByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 100000, 60000, 50);
+ BlockingQueue docs = connector.concurrentDocumentsByQuery(
+ (harvestkey == null ? "" : WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+ WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
+ 0, 100000, 60000, 50);
SolrDocument doc;
String protocol, urlstub, id;
@@ -335,6 +341,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// all processing steps checked, remove the processing tag
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
+ sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
// send back to index
connector.add(sid);
diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java
index 096a15d9a..f5f0f3700 100644
--- a/source/net/yacy/search/schema/WebgraphSchema.java
+++ b/source/net/yacy/search/schema/WebgraphSchema.java
@@ -36,6 +36,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
+ harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
@@ -51,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
+ source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@@ -85,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
+ target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),
@@ -114,6 +117,23 @@ public enum WebgraphSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
+ // verify our naming scheme
+ String name = this.name();
+ int p = name.indexOf('_');
+ if (p > 0) {
+ String ext = name.substring(p + 1);
+ assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
+ assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
+ assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
+ assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
+ assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
+ assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
+ assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
+ assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
+ assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
+ assert !ext.equals("val") || (type == SolrType.num_integer && multiValued) : name;
+ assert !ext.equals("d") || (type == SolrType.num_double && !multiValued) : name;
+ }
assert type.appropriateName(this) : "bad configuration: " + this.name();
}
diff --git a/source/net/yacy/server/http/TemplateEngine.java b/source/net/yacy/server/http/TemplateEngine.java
index 41074eb83..e0cac55fd 100644
--- a/source/net/yacy/server/http/TemplateEngine.java
+++ b/source/net/yacy/server/http/TemplateEngine.java
@@ -391,7 +391,6 @@ public final class TemplateEngine {
// #%
} else if ((bb & 0xFF) == pcChar) { //include
- final ByteBuffer include = new ByteBuffer();
keyStream.reset(); //reset stream
if(transferUntil(pis, keyStream, iClose)){
byte[] filename = keyStream.toByteArray();
@@ -403,6 +402,7 @@ public final class TemplateEngine {
filename= replacePattern(patternkey, pattern, dflt);
}
if (filename.length > 0 && !java.util.Arrays.equals(filename, dflt)) {
+ final ByteBuffer include = new ByteBuffer();
BufferedReader br = null;
try{
//br = new BufferedReader(new InputStreamReader(new FileInputStream( filename ))); //Simple Include
@@ -422,9 +422,9 @@ public final class TemplateEngine {
structure.append(ASCII.getBytes("\n"));
+ include.close();
}
}
-
// # - no special character. This is simply a '#' without meaning
} else { //no match, but a single hash (output # + bb)
out.write(hashChar);
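
The TemplateEngine change above delays the ByteBuffer allocation until an include file is actually processed and closes it afterwards; before, the buffer was created for every '#%' token and never closed. A minimal sketch of the tightened pattern, using try/finally for clarity (the patch itself closes the buffer after the read):

    // Sketch: allocate the buffer only when it is needed, release it when done.
    if (filename.length > 0 && !java.util.Arrays.equals(filename, dflt)) {
        final ByteBuffer include = new ByteBuffer(); // allocated lazily
        try {
            // ... read the include file into 'include' and write it out ...
        } finally {
            include.close(); // always release the buffer
        }
    }
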
diff --git a/source/net/yacy/server/serverSwitch.java b/source/net/yacy/server/serverSwitch.java
index 67efd230a..2faa72b66 100644
--- a/source/net/yacy/server/serverSwitch.java
+++ b/source/net/yacy/server/serverSwitch.java
@@ -63,7 +63,7 @@ public class serverSwitch
public final File dataPath;
public final File appPath;
protected boolean firstInit;
- protected ConcurrentLog log;
+ public ConcurrentLog log;
protected int serverJobs;
private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved;