merge with rc1/master

pull/1/head
reger 12 years ago
commit c7c706fd9f

@@ -56,14 +56,14 @@
<string>$JAVAROOT/lib/commons-io-2.1.jar</string>
<string>$JAVAROOT/lib/commons-jxpath-1.3.jar</string>
<string>$JAVAROOT/lib/commons-lang-2.6.jar</string>
<string>$JAVAROOT/lib/commons-logging-1.1.1.jar</string>
<string>$JAVAROOT/lib/commons-logging-1.1.3.jar</string>
<string>$JAVAROOT/lib/fontbox-1.8.2.jar</string>
<string>$JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar</string>
<string>$JAVAROOT/lib/guava-13.0.1.jar</string>
<string>$JAVAROOT/lib/htmllexer.jar</string>
<string>$JAVAROOT/lib/httpclient-4.2.5.jar</string>
<string>$JAVAROOT/lib/httpcore-4.2.4.jar</string>
<string>$JAVAROOT/lib/httpmime-4.2.5.jar</string>
<string>$JAVAROOT/lib/httpclient-4.3.jar</string>
<string>$JAVAROOT/lib/httpcore-4.3.jar</string>
<string>$JAVAROOT/lib/httpmime-4.3.jar</string>
<string>$JAVAROOT/lib/icu4j-core.jar</string>
<string>$JAVAROOT/lib/iri-0.8.jar</string>
<string>$JAVAROOT/lib/J7Zip-modified.jar</string>

@@ -90,6 +90,9 @@ clickdepth_i
## needed (post-)processing steps on this metadata set
process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
harvestkey_s
### optional but highly recommended values, part of the index distribution process
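The harvestkey_s field above ties each indexed document to the crawl profile that produced it, so near-realtime postprocessing can select exactly those documents and drop the key afterwards. Below is a minimal SolrJ sketch of that lifecycle; the Solr URL, document id and profile hash are made-up example values, and clearing the field through an atomic update assumes the update log is enabled in solrconfig.xml — this illustrates the idea, it is not YaCy's actual postprocessing code.

import java.util.Collections;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class HarvestKeyExample {
    public static void main(String[] args) throws Exception {
        // example values only; not YaCy's real endpoint or profile hash
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/collection1");
        String profileHash = "exampleProfileHash";

        // index a document tagged with the crawl profile hash key
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "example-doc-1");
        doc.addField("harvestkey_s", profileHash);
        solr.add(doc);
        solr.commit();

        // postprocessing selects only the documents of this harvest
        SolrQuery pending = new SolrQuery("harvestkey_s:" + profileHash);
        long count = solr.query(pending).getResults().getNumFound();
        System.out.println("documents awaiting postprocessing: " + count);

        // once postprocessing has terminated, remove the key again
        // (atomic update; requires <updateLog/> in solrconfig.xml)
        SolrInputDocument clearKey = new SolrInputDocument();
        clearKey.addField("id", "example-doc-1");
        clearKey.addField("harvestkey_s", Collections.singletonMap("set", null));
        solr.add(clearKey);
        solr.commit();
        solr.shutdown();
    }
}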

@@ -26,6 +26,9 @@ collection_sxt
## needed (post-)processing steps on this metadata set, used e.g. for clickdepth computation.
#process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
harvestkey_s
##
@@ -71,6 +74,10 @@ source_id_s
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
## copy of the citation rank norm value from the source link
source_cr_host_norm_i
## host of the url (source)
#source_host_s
@@ -168,6 +175,10 @@ target_path_folders_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
target_cr_host_norm_i
## host of the url (target)
#target_host_s

@@ -772,7 +772,7 @@ search.result.show.tags = false
# search navigators: comma-separated list of default values for search navigation.
# can be temporarily different if the search string is given with different navigation values
# assigning no value(s) means that no navigation is shown
search.navigation=hosts,authors,namespace,topics,filetype,protocol
search.navigation=location,hosts,authors,namespace,topics,filetype,protocol
# search result verification and snippet fetch caching rules
# each search result can be verified by loading the link from the web
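The search.navigation value is a plain comma-separated list that the servlets split into the individual navigator names; the yacysearch.java hunk further down does this with sb.getConfig(...).split(","). A small standalone sketch of that parsing follows, with the value read from a system property purely for illustration (the key and lookup mechanism are assumptions, only the comma-separated format matches the setting above):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class NavigationConfigExample {
    public static void main(String[] args) {
        // illustrative default and lookup; not YaCy's actual config access
        String configured = System.getProperty("search.navigation",
                "location,hosts,authors,namespace,topics,filetype,protocol");
        // an empty value means that no navigation is shown
        List<String> navigators = configured.isEmpty()
                ? Collections.<String>emptyList()
                : Arrays.asList(configured.split(","));
        System.out.println("active navigators: " + navigators);
    }
}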

@@ -105,6 +105,7 @@ public class CrawlProfileEditor_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
sb.crawler.removePassive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);

@@ -129,6 +129,7 @@ public class Crawler_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
sb.crawler.removePassive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);

@@ -316,10 +316,18 @@ public class DictionaryLoader_p {
}
// check status again
boolean keepPlacesTagging = false;
for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {
prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0);
int newstatus = dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0;
if (newstatus == 1) keepPlacesTagging = true;
prop.put(dictionary.nickname + "Status", newstatus);
}
// if all locations are deleted or deactivated, remove also the vocabulary
if (!keepPlacesTagging) {
LibraryProvider.autotagging.removePlaces();
}
return prop; // return rewrite values for templates
}
}

@@ -635,7 +635,8 @@ public class IndexControlRWIs_p {
"",//userAgent
false,
false,
0.0d, 0.0d, 0.0d);
0.0d, 0.0d, 0.0d,
new String[0]);
final SearchEvent theSearch = SearchEventCache.getEvent(query, sb.peers, sb.tables, null, false, sb.loader, Integer.MAX_VALUE, Long.MAX_VALUE, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
if (theSearch.rwiProcess != null && theSearch.rwiProcess.isAlive()) try {theSearch.rwiProcess.join();} catch (final InterruptedException e) {}
if (theSearch.local_rwi_available.get() == 0) {

@@ -69,7 +69,10 @@ public class IndexCreateQueues_p {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(name)) continue;
if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
if (compiledPattern.matcher(name).find()) {
sb.crawler.removeActive(entry.handle().getBytes());
sb.crawler.removePassive(entry.handle().getBytes());
}
}
} else {
// iterating through the list of URLs

@@ -251,7 +251,8 @@ public final class search {
false,
0.0d,
0.0d,
0.0d
0.0d,
new String[0]
);
Network.log.info("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
@@ -315,7 +316,8 @@ public final class search {
false,
0.0d,
0.0d,
0.0d
0.0d,
new String[0]
);
Network.log.info("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()), ""));

@@ -668,7 +668,8 @@ public class yacysearch {
&& sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false)
&& sb.peers.mySeed().getFlagAcceptRemoteIndex(),
false,
lat, lon, rad);
lat, lon, rad,
sb.getConfig("search_navigation","").split(","));
EventTracker.delete(EventTracker.EClass.SEARCH);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
theQuery.id(true),

@@ -26,6 +26,7 @@ import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@@ -93,7 +94,7 @@ public class yacysearch_location {
// get a queue of search results
final String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
final BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, null);
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, ClientIdentification.yacyInternetCrawlerAgent);
// take the results and compute some locations
RSSMessage message;

@@ -387,7 +387,8 @@ public class yacysearchtrailer {
// category: location search
// show only if there is a location database present and if there had been any search results
if (LibraryProvider.geoLoc.isEmpty() || theSearch.getResultCount() == 0) {
if ((LibraryProvider.geoLoc.isEmpty() || theSearch.getResultCount() == 0) &&
(theSearch.locationNavigator == null || theSearch.locationNavigator.isEmpty())) {
prop.put("cat-location", 0);
} else {
prop.put("cat-location", 1);

Binary file not shown.

Binary file not shown.

@@ -22,8 +22,8 @@ commons-io-2.1.jar
commons-lang-2.6.jar
geronimo-stax-api_1.0_spec-1.0.1.jar
guava-r05.jar
httpclient-4.2.3.jar
httpcore-4.2.3.jar
httpclient-4.3.jar
httpcore-4.3.jar
jcl-over-slf4j-1.6.1.jar
log4j-over-slf4j-1.6.1.jar
lucene-analyzers-3.6.0.jar

Binary file not shown.

Binary file not shown.

@@ -1,240 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
=========================================================================
This project contains annotations in the package org.apache.http.annotation
which are derived from JCIP-ANNOTATIONS
Copyright (c) 2005 Brian Goetz and Tim Peierls.
See http://www.jcip.net and the Creative Commons Attribution License
(http://creativecommons.org/licenses/by/2.5)
Full text: http://creativecommons.org/licenses/by/2.5/legalcode
License
THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
1. Definitions
"Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.
"Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.
"Licensor" means the individual or entity that offers the Work under the terms of this License.
"Original Author" means the individual or entity who created the Work.
"Work" means the copyrightable work of authorship offered under the terms of this License.
"You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.
3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:
to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;
to create and reproduce Derivative Works;
to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;
to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.
For the avoidance of doubt, where the work is a musical composition:
Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.
Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights agency or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).
Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).
The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.
4. Restrictions.The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:
You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any credit as required by clause 4(b), as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any credit as required by clause 4(b), as requested.
If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or (ii) if the Original Author and/or Licensor designate another party or parties (e.g. a sponsor institute, publishing entity, journal) for attribution in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.
5. Representations, Warranties and Disclaimer
UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. Termination
This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
8. Miscellaneous
Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.

Binary file not shown.

@@ -1,202 +1,176 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,176 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

Binary file not shown.

@ -73,7 +73,7 @@
<compilation-unit>
<package-root>source</package-root>
<package-root>htroot</package-root>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.2.3.jar;lib/httpcore-4.2.3.jar;lib/httpmime-4.2.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar</classpath>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.jar;lib/httpcore-4.3.jar;lib/httpmime-4.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar</classpath>
<source-level>1.6</source-level>
</compilation-unit>
</java-data>

@ -24,10 +24,10 @@
package net.yacy.cora.document.encoding;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Comparator;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.content.StringBody;
/**
@ -45,6 +45,7 @@ public class UTF8 implements Comparator<String> {
static {
charset = Charset.forName("UTF-8");
}
private final static ContentType contentType = ContentType.TEXT_PLAIN.withCharset(charset);
public static final UTF8 insensitiveUTF8Comparator = new UTF8(true);
public static final UTF8 identityUTF8Comparator = new UTF8(false);
@ -103,12 +104,7 @@ public class UTF8 implements Comparator<String> {
}
public final static StringBody StringBody(final String s) {
try {
return new StringBody(s == null ? "" : s, charset);
} catch (final UnsupportedEncodingException e) {
e.printStackTrace();
return null;
}
return new StringBody(s == null ? "" : s, contentType);
}
/**

@ -48,7 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
public class DigestURL extends MultiProtocolURL implements Serializable {
private static final long serialVersionUID = -1173233022912141885L;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
// class variables
private byte[] hash;

@ -24,6 +24,8 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import org.openjena.atlas.logging.Log;
import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema;
@ -75,16 +77,22 @@ public class Ranking {
* @param boostDef the definition string
*/
public void updateBoosts(String boostDef) {
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description_txt^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
if (boostDef == null || boostDef.length() == 0) return;
String[] bf = CommonPattern.COMMA.split(boostDef);
this.fieldBoosts.clear();
for (String boost: bf) {
int p = boost.indexOf('^');
if (p < 0) continue;
CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p));
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
String boostkey = boost.substring(0, p);
try {
CollectionSchema field = CollectionSchema.valueOf(boostkey);
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
} catch (IllegalArgumentException e) {
// boostkey is unknown; ignore it but print warning
Log.warn("Ranking", "unknwon boost key '" + boostkey + "'");
}
}
}
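A minimal usage sketch of the hardened boost parser above (the Ranking instance r and its construction are assumed; the boost string follows the format shown in the code comment):
    // unknown field names - e.g. a renamed 'description' field - are now logged and skipped instead of throwing
    r.updateBoosts("sku^20.0,title^15.0,description_txt^5.0,text_t^1.0");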

@ -33,6 +33,7 @@ import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index shall be modified and indexed again.
* This shall be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
}
public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check each field if enabled in local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT

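A rough usage sketch for the two conversion helpers added above (the SchemaConfiguration instance config and a previously fetched SolrDocument doc are assumed; "_version_" is only an example of a field created automatically during indexing):
    Set<String> omit = new HashSet<String>();
    omit.add("_version_");
    SolrInputDocument sid = config.toSolrInputDocument(doc, omit); // re-indexable copy, restricted to fields enabled in the local schema
    SolrDocument roundtrip = config.toSolrDocument(sid, omit);     // and back again, e.g. after postprocessing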
@ -46,24 +46,17 @@ import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.HttpContext;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
@SuppressWarnings("deprecation") //TODO: switch to 4.3-Stuff
public class RemoteInstance implements SolrInstance {
private String solrurl;
private final DefaultHttpClient client;
private final org.apache.http.impl.client.DefaultHttpClient client;
// 4.3 private final CloseableHttpClient client;
private final String defaultCoreName;
private final HttpSolrServer defaultServer;
private final Collection<String> coreNames;
@ -133,25 +126,73 @@ public class RemoteInstance implements SolrInstance {
}
}
if (solraccount.length() > 0) {
PoolingClientConnectionManager cm = new PoolingClientConnectionManager(); // try also: ThreadSafeClientConnManager
// 4.3:
// final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
// cm.setMaxTotal(100);
//
// final RequestConfig.Builder reqBuilder = RequestConfig.custom();
// reqBuilder.setSocketTimeout(timeout);
// reqBuilder.setConnectTimeout(timeout);
// reqBuilder.setConnectionRequestTimeout(timeout);
//
// final BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
// credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw));
//
// final HttpClientBuilder builder = HttpClientBuilder.create();
// builder.setConnectionManager(cm);
// builder.setDefaultRequestConfig(reqBuilder.build());
// builder.setDefaultCredentialsProvider(credsProvider);
// builder.disableAutomaticRetries(); // no retries needed; we expect connections to fail; therefore we should not retry
// // ask for gzip - why not using net.yacy.cora.protocol.http.GzipRequestInterceptor?
// builder.addInterceptorLast(new HttpRequestInterceptor() {
// @Override
// public void process(final HttpRequest request, final HttpContext context) throws IOException {
// if (!request.containsHeader("Accept-Encoding")) request.addHeader("Accept-Encoding", "gzip");
// if (!request.containsHeader("Connection")) request.addHeader("Connection", "close"); // prevent CLOSE_WAIT
// }
//
// });
// // uncompress gzip - why not using net.yacy.cora.protocol.http.GzipResponseInterceptor?
// builder.addInterceptorLast(new HttpResponseInterceptor() {
// @Override
// public void process(final HttpResponse response, final HttpContext context) throws IOException {
// HttpEntity entity = response.getEntity();
// if (entity != null) {
// Header ceheader = entity.getContentEncoding();
// if (ceheader != null) {
// HeaderElement[] codecs = ceheader.getElements();
// for (HeaderElement codec : codecs) {
// if (codec.getName().equalsIgnoreCase("gzip")) {
// response.setEntity(new GzipDecompressingEntity(response.getEntity()));
// return;
// }
// }
// }
// }
// }
// });
// this.client = builder.build();
// old Stuff START
org.apache.http.impl.conn.PoolingClientConnectionManager cm = new org.apache.http.impl.conn.PoolingClientConnectionManager(); // try also: ThreadSafeClientConnManager
cm.setMaxTotal(100);
this.client = new DefaultHttpClient(cm) {
this.client = new org.apache.http.impl.client.DefaultHttpClient(cm) {
@Override
protected HttpContext createHttpContext() {
HttpContext context = super.createHttpContext();
AuthCache authCache = new BasicAuthCache();
AuthCache authCache = new org.apache.http.impl.client.BasicAuthCache();
BasicScheme basicAuth = new BasicScheme();
HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol());
authCache.put(targetHost, basicAuth);
context.setAttribute(ClientContext.AUTH_CACHE, authCache);
this.setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false)); // no retries needed; we expect connections to fail; therefore we should not retry
context.setAttribute(org.apache.http.client.protocol.ClientContext.AUTH_CACHE, authCache);
this.setHttpRequestRetryHandler(new org.apache.http.impl.client.DefaultHttpRequestRetryHandler(0, false)); // no retries needed; we expect connections to fail; therefore we should not retry
return context;
}
};
HttpParams params = this.client.getParams();
HttpConnectionParams.setConnectionTimeout(params, timeout);
HttpConnectionParams.setSoTimeout(params, timeout);
org.apache.http.params.HttpParams params = this.client.getParams();
org.apache.http.params.HttpConnectionParams.setConnectionTimeout(params, timeout);
org.apache.http.params.HttpConnectionParams.setSoTimeout(params, timeout);
this.client.addRequestInterceptor(new HttpRequestInterceptor() {
@Override
public void process(final HttpRequest request, final HttpContext context) throws IOException {
@ -178,9 +219,10 @@ public class RemoteInstance implements SolrInstance {
}
}
});
BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
org.apache.http.impl.client.BasicCredentialsProvider credsProvider = new org.apache.http.impl.client.BasicCredentialsProvider();
credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw));
this.client.setCredentialsProvider(credsProvider);
// old Stuff END
} else {
this.client = null;
}
@ -248,7 +290,14 @@ public class RemoteInstance implements SolrInstance {
@Override
public void close() {
if (this.client != null) this.client.getConnectionManager().shutdown();
// 4.3
// if (this.client != null)
// try {
// this.client.close();
// } catch (final IOException e) {
// // TODO Auto-generated catch block
// }
}
}

@ -110,7 +110,7 @@ public class AutotaggingLibrary {
}
public void addPlaces(Locations locations) {
if (locations.isEmpty()) return; // otherwise we get a navigation that does nothing
Tagging voc = new Tagging("Locations", locations);
try {
voc.setObjectspace("http://dbpedia.org/resource/");
@ -122,6 +122,10 @@ public class AutotaggingLibrary {
}
}
public void removePlaces() {
this.vocabularies.remove("Locations");
}
public int size() {
return this.vocabularies.size();
}

@ -28,6 +28,7 @@ package net.yacy.cora.protocol;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MIME;
import org.apache.http.entity.mime.content.AbstractContentBody;
@ -42,7 +43,7 @@ public class ByteArrayBody extends AbstractContentBody {
* @param filename
*/
public ByteArrayBody(final byte[] bytes, final String filename) {
super("application/octet-stream");
super(ContentType.APPLICATION_OCTET_STREAM);
this.bytes = bytes;
this.filename = filename;
}

@ -74,8 +74,8 @@ public class Domains {
private static final String PRESENT = "";
private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)");
private static final int MAX_NAME_CACHE_HIT_SIZE = 100000;
private static final int MAX_NAME_CACHE_MISS_SIZE = 100000;
private static final int MAX_NAME_CACHE_HIT_SIZE = 10000;
private static final int MAX_NAME_CACHE_MISS_SIZE = 1000;
private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2;
// a dns cache
@ -782,7 +782,7 @@ public class Domains {
public InetAddress call() throws Exception {
return InetAddress.getByName(host);
}
}, 1000L, TimeUnit.MILLISECONDS, false);
}, 3000L, TimeUnit.MILLISECONDS, false);
//ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
}
//.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms");

@ -28,12 +28,12 @@ package net.yacy.cora.protocol.http;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@ -49,7 +49,6 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.http.ProxySettings.Protocol;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
@ -58,41 +57,32 @@ import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.ConnectionKeepAliveStrategy;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.conn.DnsResolver;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.entity.mime.MultipartEntity;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicHeaderElementIterator;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.ByteArrayBuffer;
@ -108,106 +98,126 @@ import org.apache.http.util.EntityUtils;
public class HTTPClient {
private final static int maxcon = 200;
private static IdledConnectionEvictor idledConnectionEvictor = null;
private static HttpClient httpClient = initConnectionManager();
private static final CredentialsProvider credsProvider = new BasicCredentialsProvider();
private static IdleConnectionMonitorThread connectionMonitor = null;
private final static RequestConfig dfltReqConf = initRequestConfig();
private final static HttpClientBuilder clientBuilder = initClientBuilder();
private final RequestConfig.Builder reqConfBuilder;
private Set<Entry<String, String>> headers = null;
private HttpResponse httpResponse = null;
private CloseableHttpResponse httpResponse = null;
private HttpUriRequest currentRequest = null;
private long upbytes = 0L;
private int timeout = 10000;
private ClientIdentification.Agent agent = null;
private String host = null;
private boolean redirecting = true;
private String realm = null;
public HTTPClient(final ClientIdentification.Agent agent) {
super();
this.agent = agent;
this.timeout = agent.clientTimeout;
HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent);
clientBuilder.setUserAgent(agent.userAgent);
reqConfBuilder = RequestConfig.copy(dfltReqConf);
reqConfBuilder.setSocketTimeout(agent.clientTimeout);
reqConfBuilder.setConnectTimeout(agent.clientTimeout);
reqConfBuilder.setConnectionRequestTimeout(agent.clientTimeout);
}
public HTTPClient(final ClientIdentification.Agent agent, final int timeout) {
super();
this.agent = agent;
this.timeout = timeout;
HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent);
clientBuilder.setUserAgent(agent.userAgent);
reqConfBuilder = RequestConfig.copy(dfltReqConf);
reqConfBuilder.setSocketTimeout(timeout);
reqConfBuilder.setConnectTimeout(timeout);
reqConfBuilder.setConnectionRequestTimeout(timeout);
}
public static void setDefaultUserAgent(final String defaultAgent) {
HttpProtocolParams.setUserAgent(httpClient.getParams(), defaultAgent);
clientBuilder.setUserAgent(defaultAgent);
}
public static HttpClient initConnectionManager() {
// Create and initialize scheme registry
final SchemeRegistry schemeRegistry = new SchemeRegistry();
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
schemeRegistry.register(new Scheme("https", 443, getSSLSocketFactory()));
final PoolingClientConnectionManager clientConnectionManager = new PoolingClientConnectionManager(schemeRegistry);
// Create and initialize HTTP parameters
final HttpParams httpParams = new BasicHttpParams();
/**
* ConnectionManager settings
*/
// how many connections do we need? - default: 20
clientConnectionManager.setMaxTotal(maxcon);
// for statistics same value should also be set here
ConnectionInfo.setMaxcount(maxcon);
// connections per host (2 default)
clientConnectionManager.setDefaultMaxPerRoute(2);
// Increase max connections for localhost
final HttpHost localhost = new HttpHost(Domains.LOCALHOST);
clientConnectionManager.setMaxPerRoute(new HttpRoute(localhost), maxcon);
/**
* HTTP protocol settings
*/
HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1);
// UserAgent
HttpProtocolParams.setUserAgent(httpParams, ClientIdentification.yacyInternetCrawlerAgent.userAgent);
HttpProtocolParams.setUseExpectContinue(httpParams, false); // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
/**
* HTTP connection settings
*/
private static RequestConfig initRequestConfig() {
final RequestConfig.Builder builder = RequestConfig.custom();
// IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
builder.setExpectContinueEnabled(false);
// timeout in milliseconds until a connection is established
HttpConnectionParams.setConnectionTimeout(httpParams, 6000);
// SO_LINGER affects the socket close operation in seconds
// HttpConnectionParams.setLinger(httpParams, 6);
// HttpConnectionParams.setSocketBufferSize(httpParams, 8192);
builder.setConnectionRequestTimeout(6000);
builder.setConnectTimeout(8000);
// SO_TIMEOUT: maximum period of inactivity between two consecutive data packets in milliseconds
HttpConnectionParams.setSoTimeout(httpParams, 1000);
builder.setSocketTimeout(3000);
// getting an I/O error when executing a request over a connection that has been closed at the server side
HttpConnectionParams.setStaleCheckingEnabled(httpParams, true);
// conserve bandwidth by minimizing the number of segments that are sent
HttpConnectionParams.setTcpNoDelay(httpParams, false);
// Defines whether the socket can be bound even though a previous connection is still in a timeout state.
HttpConnectionParams.setSoReuseaddr(httpParams, true);
/**
* HTTP client settings
*/
builder.setStaleConnectionCheckEnabled(true);
// ignore cookies, because this may cause segfaults in the default cookiestore and is not needed
HttpClientParams.setCookiePolicy(httpParams, CookiePolicy.IGNORE_COOKIES);
httpClient = new DefaultHttpClient(clientConnectionManager, httpParams);
builder.setCookieSpec(CookieSpecs.IGNORE_COOKIES);
builder.setRedirectsEnabled(true);
builder.setRelativeRedirectsAllowed(true);
return builder.build();
}
private static HttpClientBuilder initClientBuilder() {
final HttpClientBuilder builder = HttpClientBuilder.create();
builder.setConnectionManager(initPoolingConnectionManager());
builder.setDefaultRequestConfig(dfltReqConf);
// UserAgent
builder.setUserAgent(ClientIdentification.yacyInternetCrawlerAgent.userAgent);
// remove retries; we expect connections to fail; therefore we should not retry
builder.disableAutomaticRetries();
// disable the cookiestore, because it may cause segfaults and is not needed
((DefaultHttpClient) httpClient).setCookieStore(null);
builder.setDefaultCookieStore(null);
builder.disableCookieManagement();
// add custom keep-alive strategy
addCustomKeepAliveStrategy((DefaultHttpClient) httpClient);
builder.setKeepAliveStrategy(customKeepAliveStrategy());
// ask for gzip
((DefaultHttpClient) httpClient).addRequestInterceptor(new GzipRequestInterceptor());
builder.addInterceptorLast(new GzipRequestInterceptor());
// uncompress gzip
((DefaultHttpClient) httpClient).addResponseInterceptor(new GzipResponseInterceptor());
// remove retries; we expect connections to fail; therefore we should not retry
((DefaultHttpClient) httpClient).setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false));
if (idledConnectionEvictor == null) {
idledConnectionEvictor = new IdledConnectionEvictor(clientConnectionManager);
idledConnectionEvictor.start();
builder.addInterceptorLast(new GzipResponseInterceptor());
// Proxy
builder.setRoutePlanner(ProxySettings.RoutePlanner);
builder.setDefaultCredentialsProvider(ProxySettings.CredsProvider);
return builder;
}
private static PoolingHttpClientConnectionManager initPoolingConnectionManager() {
final PlainConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
final Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", plainsf)
.register("https", getSSLSocketFactory())
.build();
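// resolve host names through YaCy's own DNS cache (Domains.dnsResolve) instead of the JVM default resolver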
final PoolingHttpClientConnectionManager pooling = new PoolingHttpClientConnectionManager(registry, new DnsResolver(){
@Override
public InetAddress[] resolve(final String host0)throws UnknownHostException {
final InetAddress ip = Domains.dnsResolve(host0);
if (ip == null) throw new UnknownHostException(host0);
return new InetAddress[]{ip};
}});
// how many connections do we need? - default: 20
pooling.setMaxTotal(maxcon);
// for statistics same value should also be set here
ConnectionInfo.setMaxcount(maxcon);
// connections per host (2 default)
pooling.setDefaultMaxPerRoute(4);
// Increase max connections for localhost
final HttpHost localhost = new HttpHost(Domains.LOCALHOST);
pooling.setMaxPerRoute(new HttpRoute(localhost), maxcon);
final SocketConfig socketConfig = SocketConfig.custom()
// Defines whether the socket can be bound even though a previous connection is still in a timeout state.
.setSoReuseAddress(true)
// SO_TIMEOUT: maximum period of inactivity between two consecutive data packets in milliseconds
.setSoTimeout(3000)
// conserve bandwidth by minimizing the number of segments that are sent
.setTcpNoDelay(false)
.build();
pooling.setDefaultSocketConfig(socketConfig);
if (connectionMonitor == null) {
connectionMonitor = new IdleConnectionMonitorThread(pooling);
connectionMonitor.start();
}
return httpClient;
return pooling;
}
/**
@ -217,34 +227,29 @@ public class HTTPClient {
* @throws InterruptedException
*/
public static void closeConnectionManager() throws InterruptedException {
if (idledConnectionEvictor != null) {
if (connectionMonitor != null) {
// Shut down the evictor thread
idledConnectionEvictor.shutdown();
idledConnectionEvictor.join();
connectionMonitor.shutdown();
connectionMonitor.join();
}
if (httpClient != null) {
// Shut down the connection manager
httpClient.getConnectionManager().shutdown();
}
}
public static void setAuth(final String host, final int port, final String user, final String pw) {
final UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user, pw);
final AuthScope scope = new AuthScope(host, port);
credsProvider.setCredentials(scope, creds);
httpClient.getParams().setParameter(ClientContext.CREDS_PROVIDER, credsProvider);
}
// public static void setAuth(final String host, final int port, final String user, final String pw) {
// final UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user, pw);
// final AuthScope scope = new AuthScope(host, port);
// credsProvider.setCredentials(scope, creds);
// httpClient.getParams().setParameter(ClientContext.CREDS_PROVIDER, credsProvider);
// }
/**
* this method sets a host on which more than the default of 2 routes per host are allowed
*
* @param the host to be raised in 'route per host'
*/
public static void setMaxRouteHost(final String host) {
final HttpHost mHost = new HttpHost(host);
((PoolingClientConnectionManager) httpClient.getConnectionManager()).setMaxPerRoute(new HttpRoute(mHost), 50);
}
// /**
// * this method sets a host on which more than the default of 2 routes per host are allowed
// *
// * @param the host to be raised in 'route per host'
// */
// public static void setMaxRouteHost(final String host) {
// final HttpHost mHost = new HttpHost(host);
// ((PoolingClientConnectionManager) httpClient.getConnectionManager()).setMaxPerRoute(new HttpRoute(mHost), 50);
// }
/**
* This method sets the Header used for the request
@ -261,7 +266,9 @@ public class HTTPClient {
* @param timeout in milliseconds
*/
public void setTimout(final int timeout) {
this.timeout = timeout;
reqConfBuilder.setSocketTimeout(timeout);
reqConfBuilder.setConnectTimeout(timeout);
reqConfBuilder.setConnectionRequestTimeout(timeout);
}
/**
@ -270,7 +277,7 @@ public class HTTPClient {
* @param userAgent
*/
public void setUserAgent(final ClientIdentification.Agent agent) {
this.agent = agent;
clientBuilder.setUserAgent(agent.userAgent);
}
/**
@ -288,7 +295,8 @@ public class HTTPClient {
* @param redirecting
*/
public void setRedirecting(final boolean redirecting) {
this.redirecting = redirecting;
reqConfBuilder.setRedirectsEnabled(redirecting);
reqConfBuilder.setRelativeRedirectsAllowed(redirecting);
}
/**
@ -354,7 +362,7 @@ public class HTTPClient {
}
httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
return getContentBytes(httpGet, url.getHost(), maxBytes);
return getContentBytes(httpGet, maxBytes);
}
/**
@ -378,7 +386,7 @@ public class HTTPClient {
httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
this.currentRequest = httpGet;
execute(httpGet, url.getHost());
execute(httpGet);
}
/**
@ -393,7 +401,7 @@ public class HTTPClient {
final HttpHead httpHead = new HttpHead(url.toNormalform(true));
httpHead.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
execute(httpHead, url.getHost());
execute(httpHead);
finish();
ConnectionInfo.removeConnection(httpHead.hashCode());
return this.httpResponse;
@ -422,7 +430,7 @@ public class HTTPClient {
this.upbytes = length;
httpPost.setEntity(inputStreamEntity);
this.currentRequest = httpPost;
execute(httpPost, host);
execute(httpPost);
}
/**
@ -454,10 +462,11 @@ public class HTTPClient {
setHost(vhost); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
if (vhost == null) setHost(Domains.LOCALHOST);
final MultipartEntity multipartEntity = new MultipartEntity();
for (final Entry<String,ContentBody> part : post.entrySet())
multipartEntity.addPart(part.getKey(), part.getValue());
final MultipartEntityBuilder entityBuilder = MultipartEntityBuilder.create();
for (final Entry<String,ContentBody> part : post.entrySet())
entityBuilder.addPart(part.getKey(), part.getValue());
final HttpEntity multipartEntity = entityBuilder.build();
// statistics
this.upbytes = multipartEntity.getContentLength();
@ -467,7 +476,7 @@ public class HTTPClient {
httpPost.setEntity(multipartEntity);
}
return getContentBytes(httpPost, url.getHost(), Integer.MAX_VALUE);
return getContentBytes(httpPost, Integer.MAX_VALUE);
}
/**
@ -491,7 +500,7 @@ public class HTTPClient {
// statistics
this.upbytes = length;
httpPost.setEntity(inputStreamEntity);
return getContentBytes(httpPost, host, Integer.MAX_VALUE);
return getContentBytes(httpPost, Integer.MAX_VALUE);
}
/**
@ -580,9 +589,9 @@ public class HTTPClient {
}
}
private byte[] getContentBytes(final HttpUriRequest httpUriRequest, String host, final int maxBytes) throws IOException {
private byte[] getContentBytes(final HttpUriRequest httpUriRequest, final int maxBytes) throws IOException {
try {
execute(httpUriRequest, host);
execute(httpUriRequest);
if (this.httpResponse == null) return null;
// get the response body
final HttpEntity httpEntity = this.httpResponse.getEntity();
@ -602,11 +611,13 @@ public class HTTPClient {
}
}
private void execute(final HttpUriRequest httpUriRequest, String host) throws IOException {
final HttpContext httpContext = new BasicHttpContext();
private void execute(final HttpUriRequest httpUriRequest) throws IOException {
final HttpClientContext context = HttpClientContext.create();
context.setRequestConfig(reqConfBuilder.build());
if (this.host != null)
context.setTargetHost(new HttpHost(this.host));
setHeaders(httpUriRequest);
setParams(httpUriRequest.getParams());
setProxy(httpUriRequest.getParams(), host);
// statistics
storeConnectionInfo(httpUriRequest);
// execute the method; some asserts confirm that the request can be sent with Content-Length and is therefore not terminated by EOF
@ -620,14 +631,17 @@ public class HTTPClient {
}
Thread.currentThread().setName("HTTPClient-" + httpUriRequest.getURI().getHost());
final long time = System.currentTimeMillis();
try {
final long time = System.currentTimeMillis();
this.httpResponse = httpClient.execute(httpUriRequest, httpContext);
final CloseableHttpClient client = clientBuilder.build();
this.httpResponse = client.execute(httpUriRequest, context);
this.httpResponse.setHeader(HeaderFramework.RESPONSE_TIME_MILLIS, Long.toString(System.currentTimeMillis() - time));
} catch (final IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort();
throw new IOException("Client can't execute: " + (e.getCause() == null ? e.getMessage() : e.getCause().getMessage()));
throw new IOException("Client can't execute: "
+ (e.getCause() == null ? e.getMessage() : e.getCause().getMessage())
+ " duration=" + Long.toString(System.currentTimeMillis() - time));
}
}
@ -669,23 +683,6 @@ public class HTTPClient {
httpUriRequest.setHeader("Authorization", "realm=" + this.realm);
}
private void setParams(final HttpParams httpParams) {
HttpClientParams.setRedirecting(httpParams, this.redirecting);
HttpConnectionParams.setConnectionTimeout(httpParams, this.timeout);
HttpConnectionParams.setSoTimeout(httpParams, this.timeout);
if (this.agent != null)
HttpProtocolParams.setUserAgent(httpParams, this.agent.userAgent);
if (this.host != null)
httpParams.setParameter(HTTP.TARGET_HOST, this.host);
}
private static void setProxy(final HttpParams httpParams, String host) {
if (ProxySettings.useForHost(host, Protocol.HTTP))
ConnRouteParams.setDefaultProxy(httpParams, ProxySettings.getProxyHost());
// TODO find a better way for this
ProxySettings.setProxyCreds((DefaultHttpClient) httpClient);
}
private void storeConnectionInfo(final HttpUriRequest httpUriRequest) {
final int port = httpUriRequest.getURI().getPort();
final String thost = httpUriRequest.getURI().getHost();
@ -699,7 +696,7 @@ public class HTTPClient {
this.upbytes));
}
private static SSLSocketFactory getSSLSocketFactory() {
private static SSLConnectionSocketFactory getSSLSocketFactory() {
final TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(final X509Certificate[] chain, final String authType)
@ -728,7 +725,9 @@ public class HTTPClient {
// e.printStackTrace();
}
final SSLSocketFactory sslSF = new SSLSocketFactory(sslContext, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
final SSLConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(
sslContext,
SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
return sslSF;
}
@ -739,8 +738,8 @@ public class HTTPClient {
*
* @param defaultHttpClient
*/
private static void addCustomKeepAliveStrategy(final DefaultHttpClient defaultHttpClient) {
defaultHttpClient.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
private static ConnectionKeepAliveStrategy customKeepAliveStrategy() {
return new ConnectionKeepAliveStrategy() {
@Override
public long getKeepAliveDuration(HttpResponse response, HttpContext context) {
// Honor 'keep-alive' header
@ -762,7 +761,7 @@ public class HTTPClient {
// Keep alive for 5 seconds only
return 5 * 1000;
}
});
};
}
/**
@ -773,13 +772,13 @@ public class HTTPClient {
public static void main(final String[] args) {
String url = null;
// prepare Parts
final Map<String,ContentBody> newparts = new LinkedHashMap<String,ContentBody>();
try {
newparts.put("foo", new StringBody("FooBar"));
newparts.put("bar", new StringBody("BarFoo"));
} catch (final UnsupportedEncodingException e) {
System.out.println(e.getStackTrace());
}
// final Map<String,ContentBody> newparts = new LinkedHashMap<String,ContentBody>();
// try {
// newparts.put("foo", new StringBody("FooBar"));
// newparts.put("bar", new StringBody("BarFoo"));
// } catch (final UnsupportedEncodingException e) {
// System.out.println(e.getStackTrace());
// }
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setRedirecting(false);
// Get some
@ -805,7 +804,7 @@ public class HTTPClient {
// for (HeaderElement element: header.getElements())
// System.out.println("Element " + element.getName() + " : " + element.getValue());
}
System.out.println(client.getHttpResponse().getLocale());
// System.out.println(client.getHttpResponse().getLocale());
System.out.println(client.getHttpResponse().getProtocolVersion());
System.out.println(client.getHttpResponse().getStatusLine());
// Post some
@ -822,49 +821,41 @@ public class HTTPClient {
}
}
public static class IdleConnectionMonitorThread extends Thread {
private final HttpClientConnectionManager connMgr;
private volatile boolean shutdown;
public IdleConnectionMonitorThread(HttpClientConnectionManager connMgr) {
super();
this.connMgr = connMgr;
}
/**
*
* @see: http://hc.apache.org/httpcomponents-client-4.0.1/tutorial/html/connmgmt.html#d4e638
*
*/
private static class IdledConnectionEvictor extends Thread {
private final ClientConnectionManager clientConnectionManager;
private volatile boolean shutdown;
public IdledConnectionEvictor(final ClientConnectionManager clientConnectionManager) {
super();
this.clientConnectionManager = clientConnectionManager;
}
@Override
public void run() {
try {
while (!this.shutdown) {
synchronized (this) {
wait(5000);
// Close expired connections
this.clientConnectionManager.closeExpiredConnections();
// Optionally, close connections
// that have been idle longer than 5 sec
// (some SOHO router act strange on >5sec idled connections)
this.clientConnectionManager.closeIdleConnections(5, TimeUnit.SECONDS);
}
}
} catch (final InterruptedException ex) {
// terminate
}
}
public void shutdown() {
this.shutdown = true;
synchronized (this) {
notifyAll();
}
}
@Override
public void run() {
try {
while (!shutdown) {
synchronized (this) {
wait(5000);
// Close expired connections
connMgr.closeExpiredConnections();
// Optionally, close connections
// that have been idle longer than 30 sec
connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
}
}
connMgr.shutdown();
} catch (final InterruptedException ex) {
// terminate
}
}
public void shutdown() {
shutdown = true;
synchronized (this) {
notifyAll();
}
}
}
}

@ -27,10 +27,16 @@ package net.yacy.cora.protocol.http;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.protocol.HttpContext;
/**
* settings for a remote proxy
@ -71,12 +77,36 @@ public final class ProxySettings {
return new HttpHost(host, port);
}
public static void setProxyCreds(AbstractHttpClient httpClient) {
if (!use) return;
httpClient.getCredentialsProvider().setCredentials(
new AuthScope(host, port),
new UsernamePasswordCredentials(user, password));
}
public static HttpRoutePlanner RoutePlanner = new HttpRoutePlanner() {
@Override
public HttpRoute determineRoute(HttpHost target, HttpRequest request, HttpContext context) throws HttpException {
if (use) {
final Protocol protocol = "https".equalsIgnoreCase(target.getSchemeName())? Protocol.HTTPS : Protocol.HTTP;
if (useForHost(target.getHostName(), protocol))
return new HttpRoute(target, null, getProxyHost(), protocol == Protocol.HTTPS);
}
return new HttpRoute(target); // direct
}
};
public static CredentialsProvider CredsProvider = new CredentialsProvider() {
@Override
public void clear() {
}
@Override
public Credentials getCredentials(AuthScope scope) {
if (host != null && host.equals(scope.getHost()) && port == scope.getPort())
return new UsernamePasswordCredentials(user, password);
return null;
}
@Override
public void setCredentials(AuthScope arg0, Credentials arg1) {
}
};
/**
* tell if a remote proxy will be used for the given host

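A minimal sketch of how these two statics are wired into the 4.3 client; this mirrors what initClientBuilder() in the HTTPClient diff above does:
    final HttpClientBuilder builder = HttpClientBuilder.create();
    builder.setRoutePlanner(ProxySettings.RoutePlanner);                 // route via the proxy only when useForHost() allows it
    builder.setDefaultCredentialsProvider(ProxySettings.CredsProvider);  // supply proxy credentials for the configured host/port
    final CloseableHttpClient client = builder.build();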
@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
@ -293,37 +290,6 @@ public class Balancer {
return map;
}
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* load a robots.txt to get the robots time.
* ATTENTION: this method causes a robots.txt to be loaded from the web, which may cause a longer delay in execution.
* This shall therefore not be called in synchronized environments.
* @param robots
* @param profileEntry
* @param crawlURL
* @return
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
/**
* get lists of crawl request entries for a specific host
* @param host
@ -428,13 +394,13 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@ -445,7 +411,7 @@ public class Balancer {
}
if (crawlEntry == null) return null;
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
@ -515,7 +481,7 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted meanwhile
Request crawlEntry = new Request(rowEntry);
CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;

@ -1,312 +0,0 @@
/**
* CrawlQueue
* Copyright 2013 by Michael Christen
* First released 30.08.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
public class CrawlQueue {
private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000;
private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
private BufferedObjectIndex urlFileIndex;
private final HandleSet double_push_check;
public CrawlQueue(
final File cachePath,
final String filename,
final boolean useTailCache,
final boolean exceed134217727) {
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
cachePath.mkdirs();
final File f = new File(cachePath, filename);
try {
this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e) {
try {
this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e1) {
ConcurrentLog.logException(e1);
}
}
this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
ConcurrentLog.info("CrawlQueue", "opened queue file with " + this.urlFileIndex.size() + " entries from " + f.toString());
}
public synchronized void close() {
if (this.urlFileIndex != null) {
this.urlFileIndex.close();
this.urlFileIndex = null;
}
}
public void clear() {
ConcurrentLog.info("CrawlQueue", "cleaning CrawlQueue with " + this.urlFileIndex.size() + " entries from " + this.urlFileIndex.filename());
try {
this.urlFileIndex.clear();
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
this.double_push_check.clear();
}
public Request get(final byte[] urlhash) throws IOException {
assert urlhash != null;
if (this.urlFileIndex == null) return null; // case occurs during shutdown
final Row.Entry entry = this.urlFileIndex.get(urlhash, false);
if (entry == null) return null;
return new Request(entry);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
// removes all entries with a specific profile hash.
// this may last some time
// returns number of deletions
// first find a list of url hashes that shall be deleted
final HandleSet urlHashes = new RowHandleSet(this.urlFileIndex.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
synchronized (this) {
final Iterator<Row.Entry> i = this.urlFileIndex.rows();
Row.Entry rowEntry;
Request crawlEntry;
while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
rowEntry = i.next();
crawlEntry = new Request(rowEntry);
if (crawlEntry.profileHandle().equals(profileHandle)) {
urlHashes.put(crawlEntry.url().hash());
}
}
}
// then delete all these urls from the queues and the file index
return remove(urlHashes);
}
/**
* this method is only here, because so many import/export methods need it
* and it was implemented in the previous architecture
* however, usage is not recommended
* @param urlHashes, a list of hashes that shall be removed
* @return number of entries that had been removed
* @throws IOException
*/
public synchronized int remove(final HandleSet urlHashes) throws IOException {
final int s = this.urlFileIndex.size();
int removedCounter = 0;
for (final byte[] urlhash: urlHashes) {
final Row.Entry entry = this.urlFileIndex.remove(urlhash);
if (entry != null) removedCounter++;
// remove from double-check caches
this.double_push_check.remove(urlhash);
}
if (removedCounter == 0) return 0;
assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
return removedCounter;
}
public boolean has(final byte[] urlhashb) {
return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb);
}
public int size() {
return this.urlFileIndex.size();
}
public boolean isEmpty() {
return this.urlFileIndex.isEmpty();
}
/**
* push a crawl request on the balancer stack
* @param entry
* @return null if this was successful or a String explaining what went wrong in case of an error
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
// double-check
if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
this.double_push_check.put(hash);
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
// add to index
final int s = this.urlFileIndex.size();
this.urlFileIndex.put(entry.toRow());
assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
// add the hash to a queue if the host is unknown to get this fast into the balancer
// now disabled to prevent that a crawl 'freezes' to a specific domain which hosts a lot of pages; the queues are filled anyway
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
}
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
return null;
}
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* Load a robots.txt to get the robots crawl-delay time.
* ATTENTION: this method may cause a robots.txt to be fetched from the web, which can delay execution considerably.
* It shall therefore not be called from synchronized code.
* @param robots
* @param crawlURL
* @param agent
* @return the robots.txt waiting time in milliseconds, never negative
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
/**
* get the next entry in this crawl queue in such a way that the domain access time delta is maximized
* and always above the given minimum delay time. An additional delay time is computed using the robots.txt
* crawl-delay time which is always respected. In case the minimum time cannot be ensured, this method pauses
* for the necessary time until the URL is released and returned as a CrawlEntry object. If no profile
* exists for the computed entry, null is returned.
* @param delay true if the requester demands forced delays using explicit thread sleep
* @param profile
* @return a url in a CrawlEntry object
* @throws IOException
* @throws SpaceExceededException
*/
public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
if (this.urlFileIndex.isEmpty()) return null;
long sleeptime = 0;
Request crawlEntry = null;
CrawlProfile profileEntry = null;
while (this.urlFileIndex.size() > 0) {
synchronized (this) {
Row.Entry rowEntry = this.urlFileIndex.removeOne();
if (rowEntry == null) return null;
crawlEntry = new Request(rowEntry);
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("CrawlQueue", "no profile entry for handle " + crawlEntry.profileHandle());
return null;
}
// check blacklist (again) because the user may have created blacklist entries after the queue has been filled
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
ConcurrentLog.fine("CrawlQueue", "URL '" + crawlEntry.url() + "' is in blacklist.");
continue;
}
// at this point we must check whether the crawlEntry is still relevant, i.e. whether its crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("CrawlQueue", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
}
break; // a valid, non-blacklisted entry with an existing profile was found; leave the loop
}
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force busy waiting here
// in the best case this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-like manner
ConcurrentLog.info("CrawlQueue", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {
rest = rest + 1000 * loops;
loops = 0;
}
Thread.currentThread().setName("CrawlQueue waiting for " + crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
synchronized(this) {
// must be synchronized here to avoid 'takeover' moves from other threads, which would then idle for the same time, which would not be enough
if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
ConcurrentLog.info("CrawlQueue", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
}
return crawlEntry;
}
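    /**
     * Editor's usage sketch (hypothetical helper, not part of the original change): drain this
     * queue the way CrawlQueues does, letting pop() enforce robots.txt crawl-delay and per-host
     * pacing. Only the method and variable names are assumptions.
     */
    public void drainForIllustration(final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
        Request next;
        while ((next = pop(true, cs, robots)) != null) {
            // each returned entry has already waited for its host delay and has passed the blacklist check
            ConcurrentLog.info("CrawlQueue", "next URL to load: " + next.url().toNormalform(true));
        }
    }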
}

@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
@ -149,7 +147,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
@ -294,7 +292,8 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
byte[] handle = UTF8.getBytes(entry.profileHandle());
final CrawlProfile profile = this.crawler.get(handle);
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
@ -302,7 +301,9 @@ public final class CrawlStacker {
return error;
}
error = checkAcceptance(entry.url(), profile, entry.depth());
error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
if (error != null) return error;
error = checkAcceptanceInitially(entry.url(), profile);
if (error != null) return error;
// store information
@ -366,53 +367,16 @@ public final class CrawlStacker {
return null;
}
public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) {
/**
* Test if a URL shall be accepted for crawling using attributes that are consistent for the whole crawl.
* These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
* @param url
* @param profile
* @return null if the URL is accepted, otherwise an error string describing why the URL was rejected
*/
public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();
final String urlstring = url.toString();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
this.log.fine("URL '" + urlstring + "' is in blacklist.");
return "url in blacklist";
}
// filter with must-match for URLs
if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
}
// filter with must-not-match for URLs
if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
}
// deny cgi
if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
return "individual url (sessionid etc) not wanted";
}
// deny post properties
if (url.isPOST() && !profile.crawlingQ()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
return "post url not allowed";
}
// check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
@ -451,13 +415,72 @@ public final class CrawlStacker {
final AtomicInteger dp = profile.getCount(url.getHost());
if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "crawl stack domain counter exceeded";
return "crawl stack domain counter exceeded (test by profile)";
}
/*
if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "result stack domain counter exceeded";
return "result stack domain counter exceeded (test by domainCount)";
}
*/
}
return null;
}
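    /**
     * Editor's usage sketch (hypothetical helper, not part of the original change): the two
     * acceptance tests are meant to be chained exactly as stackCrawl() does above; a null
     * result means the URL may be crawled.
     */
    public String checkAcceptanceSketch(final DigestURL url, final CrawlProfile profile, final int depth) {
        String error = checkAcceptanceChangeable(url, profile, depth);
        if (error == null) error = checkAcceptanceInitially(url, profile);
        return error;
    }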
/**
* Test if a URL shall be accepted using attributes that are defined by a crawl start but can be changed during a crawl.
* @param url
* @param profile
* @param depth
* @return null if the URL is accepted, otherwise an error string describing why the URL was rejected
*/
public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();
final String urlstring = url.toString();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
this.log.fine("URL '" + urlstring + "' is in blacklist.");
return "url in blacklist";
}
// filter with must-match for URLs
if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
}
// filter with must-not-match for URLs
if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
}
// deny cgi
if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
return "individual url (sessionid etc) not wanted";
}
// deny post properties
if (url.isPOST() && !profile.crawlingQ()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
return "post url not allowed";
}
// the following filters use a DNS lookup to check if the URL matches the IP filter
@ -498,7 +521,6 @@ public final class CrawlStacker {
return null;
}
/**
* Test a url if it can be used for crawling/indexing
* This mainly checks if the url is in the declared domain (local/global)

@ -80,8 +80,8 @@ public final class CrawlSwitchboard {
DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE);
}
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@ -103,21 +103,23 @@ public final class CrawlSwitchboard {
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
private Switchboard switchboard;
public CrawlSwitchboard(final String networkName, final ConcurrentLog log, final File queuesRoot) {
public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
log.info("Initializing Word Index for the network '" + networkName + "'.");
this.switchboard = switchboard;
this.log = this.switchboard.log;
this.queuesRoot = this.switchboard.queuesRoot;
this.log.info("Initializing Word Index for the network '" + networkName + "'.");
if ( networkName == null || networkName.isEmpty() ) {
log.severe("no network name given - shutting down");
System.exit(0);
}
this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
this.profilesActiveCrawlsCounter = new ConcurrentHashMap<String, RowHandleSet>();
// make crawl profiles database and default profiles
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.config("Initializing Crawl Profiles");
@ -166,6 +168,23 @@ public final class CrawlSwitchboard {
/ 1024);
}
/**
* Get a profile from the active or the passive stack. Use this to be sure not to miss old, cleaned profiles.
* A profile that was discovered from the passive stack is automatically shifted back to the active stack.
* @param profileKey
* @return the profile if it is found on either stack, otherwise null
*/
public CrawlProfile get(final byte[] profileKey) {
CrawlProfile profile = getActive(profileKey);
if (profile != null) return profile;
profile = getPassive(profileKey);
if (profile == null) return null;
// the profile was found on the passive stack: move it back to the active stack
this.putActive(profileKey, profile);
this.removePassive(profileKey);
return profile;
}
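    /**
     * Editor's usage sketch (hypothetical helper, not part of the original change): the pattern
     * used by the call sites changed in this commit, which now resolve a request's profile handle
     * through get() so that profiles parked on the passive stack are still found (and moved back
     * to the active stack). Assumes UTF8 is available here as it is at those call sites.
     */
    public CrawlProfile profileForHandle(final String handle) {
        return handle == null ? null : get(UTF8.getBytes(handle));
    }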
public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) {
return null;
@ -237,10 +256,12 @@ public final class CrawlSwitchboard {
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
this.profilesActiveCrawlsCache.put(profileKey, profile);
this.removePassive(profileKey);
}
public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesPassiveCrawls.put(profileKey, profile);
this.removeActive(profileKey);
}
public RowHandleSet getURLHashes(final byte[] profileKey) {
@ -534,7 +555,7 @@ public final class CrawlSwitchboard {
return hasDoneSomething;
}
public int cleanFinishesProfiles(CrawlQueues crawlQueues) {
public Set<String> getFinishesProfiles(CrawlQueues crawlQueues) {
// clear the counter cache
this.profilesActiveCrawlsCounter.clear();
@ -547,7 +568,7 @@ public final class CrawlSwitchboard {
deletionCandidate.add(ASCII.String(handle));
}
}
if (deletionCandidate.size() == 0) return 0;
if (deletionCandidate.size() == 0) return new HashSet<String>(0);
// iterate through all the queues and see if one of these handles appear there
// this is a time-consuming process, set a time-out
@ -564,15 +585,24 @@ public final class CrawlSwitchboard {
if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
deletionCandidate.remove(handle);
if (deletionCandidate.size() == 0) return 0;
if (System.currentTimeMillis() > timeout) return 0; // give up; this is too large
if (deletionCandidate.size() == 0) return new HashSet<String>(0);
if (System.currentTimeMillis() > timeout) return new HashSet<String>(0); // give up; this is too large
}
if (deletionCandidate.size() == 0) return 0;
if (deletionCandidate.size() == 0) return new HashSet<String>(0);
}
// look into the CrawlQueues.worker as well
Request[] requests = switchboard.crawlQueues.activeWorkerEntries();
for (Request request: requests) {
deletionCandidate.remove(request.profileHandle());
}
} catch (final Throwable e) {
return 0;
ConcurrentLog.logException(e);
return new HashSet<String>(0);
}
return deletionCandidate;
}
public void cleanProfiles(Set<String> deletionCandidate) {
// all entries that are left are candidates for deletion; do that now
for (String h: deletionCandidate) {
byte[] handle = ASCII.getBytes(h);
@ -582,7 +612,6 @@ public final class CrawlSwitchboard {
this.removeActive(handle);
}
}
return deletionCandidate.size();
}
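    /**
     * Editor's usage sketch (hypothetical helper, not part of the original change): the intended
     * calling sequence for the two methods above, mirroring how Switchboard.cleanupJob wires
     * them up later in this commit (postprocessing runs between the two calls there).
     */
    public int cleanFinishedProfilesSketch(final CrawlQueues crawlQueues) {
        final Set<String> finished = getFinishesProfiles(crawlQueues);
        if (!finished.isEmpty()) cleanProfiles(finished);
        return finished.size();
    }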
public synchronized void close() {

@ -0,0 +1,256 @@
/**
* HostQueue
* Copyright 2013 by Michael Christen
* First released 24.09.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
public class HostQueue {
public static final String indexSuffix = ".stack";
private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000;
private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
private final String hostHash;
private final File queuesPath;
private BufferedObjectIndex requestStack;
private HandleSet urlHashDoubleCheck;
public HostQueue(
final File queuesPath,
final String hostHash,
final boolean useTailCache,
final boolean exceed134217727) {
this.hostHash = hostHash;
this.queuesPath = queuesPath;
this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
// create a stack for newly entered entries
if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path
this.queuesPath.mkdirs();
final File f = new File(this.queuesPath, this.hostHash + indexSuffix);
try {
this.requestStack = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e) {
try {
this.requestStack = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e1) {
ConcurrentLog.logException(e1);
}
}
ConcurrentLog.info("Balancer", "opened balancer file with " + this.requestStack.size() + " entries from " + f.toString());
}
public synchronized void close() {
int sizeBeforeClose = this.size();
if (this.urlHashDoubleCheck != null) {
this.urlHashDoubleCheck.clear();
this.urlHashDoubleCheck = null;
}
if (this.requestStack != null) {
this.requestStack.close();
this.requestStack = null;
}
if (sizeBeforeClose == 0) {
// clean up
new File(this.queuesPath, this.hostHash + indexSuffix).delete();
}
}
public void clear() {
try {
this.requestStack.clear();
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
this.urlHashDoubleCheck.clear();
}
public Request get(final byte[] urlhash) throws IOException {
assert urlhash != null;
if (this.requestStack == null) return null; // case occurs during shutdown
final Row.Entry entry = this.requestStack.get(urlhash, false);
if (entry == null) return null;
return new Request(entry);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
// first find a list of url hashes that shall be deleted
final HandleSet urlHashes = new RowHandleSet(this.requestStack.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
synchronized (this) {
final Iterator<Row.Entry> i = this.requestStack.rows();
Row.Entry rowEntry;
Request crawlEntry;
while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
rowEntry = i.next();
crawlEntry = new Request(rowEntry);
if (crawlEntry.profileHandle().equals(profileHandle)) {
urlHashes.put(crawlEntry.url().hash());
}
}
}
// then delete all these urls from the queues and the file index
return remove(urlHashes);
}
/**
* remove URLs from the queue
* @param urlHashes a list of hashes that shall be removed
* @return number of entries that had been removed
* @throws IOException
*/
public synchronized int remove(final HandleSet urlHashes) throws IOException {
final int s = this.requestStack.size();
int removedCounter = 0;
for (final byte[] urlhash: urlHashes) {
final Row.Entry entry = this.requestStack.remove(urlhash);
if (entry != null) removedCounter++;
// remove from double-check caches
this.urlHashDoubleCheck.remove(urlhash);
}
if (removedCounter == 0) return 0;
assert this.requestStack.size() + removedCounter == s : "urlFileIndex.size() = " + this.requestStack.size() + ", s = " + s;
return removedCounter;
}
public boolean has(final byte[] urlhashb) {
return this.requestStack.has(urlhashb) || this.urlHashDoubleCheck.has(urlhashb);
}
public int size() {
return this.requestStack.size();
}
public boolean isEmpty() {
return this.requestStack.isEmpty();
}
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
// double-check
if (this.urlHashDoubleCheck.has(hash)) return "double occurrence in double_push_check";
if (this.requestStack.has(hash)) return "double occurrence in urlFileIndex";
if (this.urlHashDoubleCheck.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.urlHashDoubleCheck.clear();
this.urlHashDoubleCheck.put(hash);
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
// add to index
final int s = this.requestStack.size();
this.requestStack.put(entry.toRow());
assert s < this.requestStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.requestStack.size();
assert this.requestStack.has(hash) : "hash = " + ASCII.String(hash);
// add the hash to a queue if the host is unknown to get it into the balancer quickly
// now disabled to prevent a crawl from 'freezing' on a specific domain that hosts a lot of pages; the queues are filled anyway
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
}
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
return null;
}
public Request pop() throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
Request crawlEntry = null;
while (!this.requestStack.isEmpty()) {
synchronized (this) {
Row.Entry rowEntry = this.requestStack.removeOne();
if (rowEntry == null) return null;
crawlEntry = new Request(rowEntry);
// check blacklist (again) because the user may have created blacklist entries after the queue has been filled
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
ConcurrentLog.fine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
continue;
}
break;
}
}
if (crawlEntry == null) return null;
return crawlEntry;
}
public Iterator<Request> iterator() throws IOException {
return new EntryIterator();
}
private class EntryIterator implements Iterator<Request> {
private Iterator<Row.Entry> rowIterator;
public EntryIterator() throws IOException {
this.rowIterator = HostQueue.this.requestStack.rows();
}
@Override
public boolean hasNext() {
return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
}
@Override
public Request next() {
final Row.Entry entry = this.rowIterator.next();
try {
return (entry == null) ? null : new Request(entry);
} catch (final IOException e) {
ConcurrentLog.logException(e);
this.rowIterator = null;
return null;
}
}
@Override
public void remove() {
if (this.rowIterator != null) this.rowIterator.remove();
}
}
}
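/**
 * Editor's usage sketch (hypothetical class, not part of the original change): a round trip
 * through one per-host stack. The queue directory, host hash and the Request/CrawlProfile
 * instances are assumed to be supplied by the caller, as HostQueues does below.
 */
class HostQueueUsageSketch {
    static void roundTrip(final File stackDir, final String hostHash, final Request req, final CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
        final HostQueue queue = new HostQueue(stackDir, hostHash, true, true);
        final String rejectReason = queue.push(req, profile, robots); // null means the request was queued
        if (rejectReason == null) {
            final Request next = queue.pop(); // the blacklist is checked again here
            ConcurrentLog.info("HostQueueUsageSketch", next == null ? "queue empty" : next.url().toNormalform(true));
        }
        queue.close(); // an empty stack file is deleted on close
    }
}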

@ -0,0 +1,169 @@
/**
* HostQueues
* Copyright 2013 by Michael Christen
* First released 24.09.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
/**
* Wrapper around single HostQueue instances; this is a collection of such queues.
* All these queues are stored in a common directory for the queue stacks.
*/
public class HostQueues {
private final File queuesPath;
private final boolean useTailCache;
private final boolean exceed134217727;
private final Map<String, HostQueue> queues;
public HostQueues(
final File queuesPath,
final boolean useTailCache,
final boolean exceed134217727) {
this.queuesPath = queuesPath;
this.useTailCache = useTailCache;
this.exceed134217727 = exceed134217727;
// create a stack for newly entered entries
if (!(queuesPath.exists())) queuesPath.mkdir(); // make the path
this.queuesPath.mkdirs();
this.queues = new HashMap<String, HostQueue>();
String[] list = this.queuesPath.list();
for (String queuefile: list) {
if (queuefile.endsWith(HostQueue.indexSuffix)) {
String hosthash = queuefile.substring(0, queuefile.length() - HostQueue.indexSuffix.length());
HostQueue queue = new HostQueue(this.queuesPath, hosthash, this.useTailCache, this.exceed134217727);
this.queues.put(hosthash, queue);
}
}
}
public synchronized void close() {
for (HostQueue queue: this.queues.values()) queue.close();
this.queues.clear();
}
public void clear() {
for (HostQueue queue: this.queues.values()) queue.clear();
this.queues.clear();
}
public Request get(final byte[] urlhash) throws IOException {
String hosthash = ASCII.String(urlhash, 6, 6);
HostQueue queue = this.queues.get(hosthash);
if (queue == null) return null;
return queue.get(urlhash);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
int c = 0;
for (HostQueue queue: this.queues.values()) c += queue.removeAllByProfileHandle(profileHandle, timeout);
return c;
}
public synchronized int remove(final HandleSet urlHashes) throws IOException {
Map<String, HandleSet> removeLists = new HashMap<String, HandleSet>();
for (byte[] urlhash: urlHashes) {
String hosthash = ASCII.String(urlhash, 6, 6);
HandleSet removeList = removeLists.get(hosthash);
if (removeList == null) {
removeList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 100);
removeLists.put(hosthash, removeList);
}
try {removeList.put(urlhash);} catch (SpaceExceededException e) {}
}
int c = 0;
for (Map.Entry<String, HandleSet> entry: removeLists.entrySet()) {
HostQueue queue = this.queues.get(entry.getKey());
if (queue != null) c += queue.remove(entry.getValue());
}
return c;
}
public boolean has(final byte[] urlhashb) {
String hosthash = ASCII.String(urlhashb, 6, 6);
HostQueue queue = this.queues.get(hosthash);
if (queue == null) return false;
return queue.has(urlhashb);
}
public int size() {
int c = 0;
for (HostQueue queue: this.queues.values()) c += queue.size();
return c;
}
public boolean isEmpty() {
for (HostQueue queue: this.queues.values()) if (!queue.isEmpty()) return false;
return true;
}
/**
* push a request to one of the host queues. If the queue does not exist, it is created
* @param entry
* @param profile
* @param robots
* @return null if everything is ok or a string with an error message if the push is not allowed according to the crawl profile or robots
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
String hosthash = ASCII.String(entry.url().hash(), 6, 6);
HostQueue queue = this.queues.get(hosthash);
if (queue == null) {
queue = new HostQueue(this.queuesPath, hosthash, this.useTailCache, this.exceed134217727);
this.queues.put(hosthash, queue);
}
return queue.push(entry, profile, robots);
}
/**
* remove one request from each host queue, except the hosts listed in notFromHost
* @param notFromHost do not collect from these hosts
* @return a list of requests
* @throws IOException
*/
public List<Request> pop(Set<String> notFromHost) throws IOException {
ArrayList<Request> requests = new ArrayList<Request>();
for (Map.Entry<String, HostQueue> entry: this.queues.entrySet()) {
if (notFromHost.contains(entry.getKey())) continue;
Request r = entry.getValue().pop();
if (r != null) requests.add(r);
}
return requests;
}
}
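/**
 * Editor's sketch (hypothetical class, not part of the original change): how HostQueues routes
 * a request to its stack. Characters 6..11 of the URL hash form the routing key (the host hash),
 * so push(), get() and remove() above all end up in the same per-host stack file for one host.
 */
class HostQueuesRoutingSketch {
    static String routingKey(final Request req) {
        return ASCII.String(req.url().hash(), 6, 6); // same expression as used in HostQueues.push()
    }
}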

@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
if (name.length() > 256) name = name.substring(0, 256); // cap the profile name at 256 characters
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength);
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
put(HANDLE, handle);
put(NAME, name);
put(AGENT_NAME, userAgentName);

@ -255,7 +255,7 @@ public class CrawlQueues {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
if (profile == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
@ -269,14 +269,13 @@ public class CrawlQueues {
if (urlEntry == null) {
continue;
}
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
if (profileHandle == null) {
if (urlEntry.profileHandle() == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
load(urlEntry, stats, profileHandle);
load(urlEntry, stats);
return true;
} catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@ -296,8 +295,8 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
private void load(final Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
private void load(final Request urlEntry, final String stats) {
final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(urlEntry.profileHandle()));
if (profile != null) {
// check if the protocol is supported
@ -574,11 +573,7 @@ public class CrawlQueues {
try {
final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots);
if (urlEntry == null) return false;
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
// urlEntry.url());
load(urlEntry, stats, profileHandle);
load(urlEntry, stats);
return true;
} catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@ -606,7 +601,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
}
private long age() {

@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
@ -262,6 +263,37 @@ public class Latency {
return s.toString();
}
/**
* Get the minimum sleep time for a given URL. The result can also be negative to reflect the time since the last access.
* The time can be as low as Integer.MIN_VALUE to indicate that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* Load a robots.txt to get the robots crawl-delay time.
* ATTENTION: this method may cause a robots.txt to be fetched from the web, which can delay execution considerably.
* It shall therefore not be called from synchronized code.
* @param robots
* @param crawlURL
* @param agent
* @return the robots.txt waiting time in milliseconds, never negative
*/
public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
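    /**
     * Editor's usage sketch (hypothetical helper, not part of the original change): the caller
     * pattern for the two static methods above, as used by the balancer's pop() earlier in this
     * commit; the agent fallback mirrors that call site.
     */
    public static long[] delaysFor(final RobotsTxt robots, final CrawlProfile profile, final DigestURL url) {
        final ClientIdentification.Agent agent = profile == null ? ClientIdentification.yacyInternetCrawlerAgent : profile.getAgent();
        return new long[] {
            getDomainSleepTime(robots, profile, url), // may be negative: no waiting required
            getRobotsTime(robots, url, agent)         // robots.txt time, never negative
        };
    }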
public static final class Host {
private AtomicLong timeacc;
private AtomicLong lastacc;

@ -101,7 +101,7 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory
@ -249,7 +249,7 @@ public class FTPLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,
@ -264,7 +264,7 @@ public class FTPLoader {
final byte[] b = ftpClient.get(path);
// create a response
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,

@ -83,7 +83,7 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@ -123,7 +123,7 @@ public class FileLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@ -140,7 +140,7 @@ public class FileLoader {
is.close();
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,

@ -589,7 +589,7 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
final Date ifModifiedSince = this.ifModifiedSince();
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = this.responseHeader.lastModified();

@ -101,7 +101,7 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@ -141,7 +141,7 @@ public class SMBLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
@ -158,7 +158,7 @@ public class SMBLoader {
is.close();
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,

@ -575,7 +575,7 @@ public final class Protocol {
maximumRecords,
verify,
global,
null);
ClientIdentification.yacyInternetCrawlerAgent);
}
protected static int primarySearch(

@ -187,7 +187,7 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol();
final String host = url.getHost();
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {

@ -536,7 +536,7 @@ public final class Switchboard extends serverSwitch {
}
// create a crawler
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
this.crawler = new CrawlSwitchboard(networkName, this);
// start yacy core
this.log.config("Starting YaCy Protocol Core");
@ -1330,7 +1330,7 @@ public final class Switchboard extends serverSwitch {
// create a crawler
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
this.crawler = new CrawlSwitchboard(networkName, this);
// init a DHT transmission dispatcher
this.dhtDispatcher =
@ -2009,7 +2009,7 @@ public final class Switchboard extends serverSwitch {
// clear caches
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
Word.clearCache();
Domains.clear();
// Domains.clear();
// clean up image stack
ResultImages.clearQueues();
@ -2130,9 +2130,24 @@ public final class Switchboard extends serverSwitch {
// clean up profiles
checkInterruption();
//cleanProfiles();
int cleanup = this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? 0 : this.crawler.cleanFinishesProfiles(this.crawlQueues);
if (cleanup > 0) log.info("cleanup removed " + cleanup + " crawl profiles");
if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
int cleanup = deletionCandidates.size();
if (cleanup > 0) {
// run postprocessing on these profiles
postprocessingRunning = true;
int proccount = 0;
for (String profileHash: deletionCandidates) {
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, profileHash);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, profileHash);
}
postprocessingRunning = false;
this.crawler.cleanProfiles(deletionCandidates);
log.info("cleanup removed " + cleanup + " crawl profiles, post-processed " + proccount + " documents");
}
}
// clean up news
checkInterruption();
@ -2268,11 +2283,14 @@ public final class Switchboard extends serverSwitch {
// if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned
if (this.crawlQueues.coreCrawlJobSize() == 0) {
if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
if (this.crawlQueues.noticeURL.isEmpty()) {
Domains.clear();
this.crawlQueues.noticeURL.clear(); // flushes more caches
}
postprocessingRunning = true;
int proccount = 0;
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index);
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, null);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, null);
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
@ -2490,13 +2508,13 @@ public final class Switchboard extends serverSwitch {
if (response.profile() != null) {
ArrayList<Document> newDocs = new ArrayList<Document>();
for (Document doc: documents) {
String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
if (rejectReason == null) {
newDocs.add(doc);
} else {
// we consider this as fail urls to have a tracking of the problem
if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
}
@ -2659,18 +2677,28 @@ public final class Switchboard extends serverSwitch {
// the condenser may be null in case that an indexing is not wanted (there may be a no-indexing flag in the file)
if ( in.condenser != null ) {
for ( int i = 0; i < in.documents.length; i++ ) {
CrawlProfile profile = in.queueEntry.profile();
storeDocumentIndex(
in.queueEntry,
in.queueEntry.profile().collections(),
in.documents[i],
in.condenser[i],
null,
"crawler/indexing queue");
profile == null ? "crawler" : profile.handle());
}
}
in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
}
/**
* Store a parsed document together with its condensed form in the search index.
* @param queueEntry
* @param collections
* @param document
* @param condenser
* @param searchEvent
* @param sourceName if this document was created by a crawl, then the sourceName contains the crawl hash
*/
private void storeDocumentIndex(
final Response queueEntry,
final Map<String, Pattern> collections,
@ -2821,7 +2849,7 @@ public final class Switchboard extends serverSwitch {
public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) {
if (rootURLs == null || rootURLs.size() == 0) return;
List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
for (DigestURL url: rootURLs) {
final DigestURL turl = url;
Thread t = new Thread() {
@ -2832,9 +2860,9 @@ public final class Switchboard extends serverSwitch {
};
t.start();
stackthreads.add(t);
try {Thread.sleep(10);} catch (final InterruptedException e) {} // to prevent this from firing more than 100 connections per second
try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent this from firing more than 10 connections per second
}
long waitingtime = 1 + (30000 / rootURLs.size()); // wait at most half a minute in total to prevent the crawl start from running into a time-out
final long waitingtime = 10 + (30000 / rootURLs.size()); // wait at most half a minute in total to prevent the crawl start from running into a time-out
for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
}
@ -2974,8 +3002,8 @@ public final class Switchboard extends serverSwitch {
continue;
}
final Request request = this.loader.request(e.getValue(), true, true);
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptanceChangeable(e.getValue(), profile, 0);
if (acceptedError != null) {
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
continue;
@ -3004,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse();
if (documents != null) {
for (final Document document: documents) {
final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
@ -3047,8 +3075,9 @@ public final class Switchboard extends serverSwitch {
if (existingids.contains(e.getKey())) continue; // double
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
String acceptedError = this.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
return;

@ -599,7 +599,7 @@ public class Segment {
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
final String sourceName,
final String sourceName, // contains the crawl profile hash if this comes from a web crawl
final boolean storeToRWI
) {
final long startTime = System.currentTimeMillis();
@ -619,7 +619,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration(), sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {

@ -79,8 +79,16 @@ public final class QueryParams {
}
}
private static final CollectionSchema[] defaultfacetfields = new CollectionSchema[]{
CollectionSchema.host_s, CollectionSchema.url_protocol_s, CollectionSchema.url_file_ext_s, CollectionSchema.author_sxt};
private static final Map<String, CollectionSchema> defaultfacetfields = new HashMap<String, CollectionSchema>();
static {
// the key shall match the configuration property search.navigation
defaultfacetfields.put("location", CollectionSchema.coordinate_p);
defaultfacetfields.put("hosts", CollectionSchema.host_s);
defaultfacetfields.put("protocol", CollectionSchema.url_protocol_s);
defaultfacetfields.put("filetype", CollectionSchema.url_file_ext_s);
defaultfacetfields.put("authors", CollectionSchema.author_sxt);
//missing: namespace
}
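    /**
     * Editor's sketch (hypothetical helper, not part of the original change): resolve the
     * configured search.navigation tokens to Solr facet field names, mirroring the constructor
     * loops below; the schema parameter type is assumed to match this.solrSchema.
     */
    private static LinkedHashSet<String> facetFieldsFor(final String[] searchNavigation, final CollectionConfiguration schema) {
        final LinkedHashSet<String> fields = new LinkedHashSet<String>();
        for (final String navkey : searchNavigation) {
            final CollectionSchema f = defaultfacetfields.get(navkey);
            if (f != null && schema.contains(f)) fields.add(f.getSolrFieldName());
        }
        return fields;
    }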
private static final int defaultmaxfacets = 30;
private static final String ampersand = "&amp;";
@ -132,7 +140,8 @@ public final class QueryParams {
final Bitfield constraint,
final Segment indexSegment,
final RankingProfile ranking,
final String userAgent) {
final String userAgent,
final String[] search_navigation) {
this.queryGoal = new QueryGoal(query_original, query_words);
this.ranking = ranking;
this.modifier = new QueryModifier();
@ -169,8 +178,9 @@ public final class QueryParams {
this.facetfields = new LinkedHashSet<String>();
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
for (CollectionSchema f: defaultfacetfields) {
if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
for (String navkey: search_navigation) {
CollectionSchema f = defaultfacetfields.get(navkey);
if (f != null && solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
}
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
this.maxfacets = defaultmaxfacets;
@ -205,7 +215,8 @@ public final class QueryParams {
final boolean filterscannerfail,
final double lat,
final double lon,
final double radius
final double radius,
final String[] search_navigation
) {
this.queryGoal = queryGoal;
this.modifier = modifier;
@ -269,8 +280,9 @@ public final class QueryParams {
this.facetfields = new LinkedHashSet<String>();
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
for (CollectionSchema f: defaultfacetfields) {
if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
for (String navkey: search_navigation) {
CollectionSchema f = defaultfacetfields.get(navkey);
if (f != null && solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
}
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
this.maxfacets = defaultmaxfacets;

@ -136,6 +136,7 @@ public final class SearchEvent {
private Thread localsolrsearch;
private int localsolroffset;
private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons
public final ScoreMap<String> locationNavigator; // a counter for the appearance of location coordinates
public final ScoreMap<String> hostNavigator; // a counter for the appearance of host names
public final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
public final ScoreMap<String> namespaceNavigator; // a counter for name spaces
@ -225,6 +226,7 @@ public final class SearchEvent {
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
// prepare configured search navigation
final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
this.locationNavigator = navcfg.contains("location") ? new ConcurrentScoreMap<String>() : null;
this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null;
this.namespaceNavigator = navcfg.contains("namespace") ? new ConcurrentScoreMap<String>() : null;
this.hostNavigator = navcfg.contains("hosts") ? new ConcurrentScoreMap<String>() : null;
@ -741,6 +743,17 @@ public final class SearchEvent {
// collect navigation information
ReversibleScoreMap<String> fcts;
if (this.locationNavigator != null) {
fcts = facets.get(CollectionSchema.coordinate_p.getSolrFieldName());
if (fcts != null) {
for (String coordinate: fcts) {
int hc = fcts.get(coordinate);
if (hc == 0) continue;
this.locationNavigator.inc(coordinate, hc);
}
}
}
if (this.hostNavigator != null) {
fcts = facets.get(CollectionSchema.host_s.getSolrFieldName());
if (fcts != null) {

@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@ -169,52 +169,33 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index shall be modified and indexed again.
* This shall be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
return toSolrInputDocument(doc, omitFields);
}
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
return toSolrDocument(doc, omitFields);
}
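
A minimal sketch, assuming a CollectionConfiguration instance named configuration and a SolrConnector named connector with a getDocumentById lookup, of the modify-and-reindex round trip that the conversion above is meant for; the document id is hypothetical:

// read one document, patch it, write the copy back to the index
try {
    SolrDocument found = connector.getDocumentById("0123456789AB");            // hypothetical url hash
    if (found != null) {
        SolrInputDocument patched = configuration.toSolrInputDocument(found);  // drops omitted fields
        patched.removeField(CollectionSchema.process_sxt.getSolrFieldName());  // example modification
        connector.add(patched);                                                // re-index the modified copy
    }
} catch (final IOException e) {
    // ignored in this sketch
}
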
/**
* add uri attributes to solr document
* @param doc
* @param allAttr
* @param digestURI
* @param digestURL
* @param doctype
* @return the normalized url
*/
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) {
add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
String us = digestURI.toNormalform(true);
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL, final char doctype) {
add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
String us = digestURL.toNormalform(true);
add(doc, CollectionSchema.sku, us);
if (allAttr || contains(CollectionSchema.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
final InetAddress address = digestURL.getInetAddress();
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
}
String host = null;
if ((host = digestURI.getHost()) != null) {
if ((host = digestURL.getHost()) != null) {
String dnc = Domains.getDNC(host);
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
int p = subdomOrga.lastIndexOf('.');
@ -228,17 +209,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// path elements of link
String filename = digestURI.getFileName();
String filename = digestURL.getFileName();
String extension = MultiProtocolURL.getFileExtension(filename);
if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol());
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths());
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype));
Map<String, String> searchpart = digestURI.getSearchpartMap();
Map<String, String> searchpart = digestURL.getSearchpartMap();
if (searchpart == null) {
if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
} else {
@ -309,7 +290,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate());
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate());
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
@ -357,27 +337,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
public SolrVector yacy2solr(
final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final IndexCell<CitationReference> citations,
final WebgraphConfiguration webgraph) {
final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
final DigestURL digestURI = document.dc_source();
final DigestURL digestURL = document.dc_source();
final String id = ASCII.String(digestURL.hash());
boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
add(doc, CollectionSchema.id, id);
String us = digestURI.toNormalform(true);
String us = digestURL.toNormalform(true);
int clickdepth = 999;
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
if (digestURI.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
if (digestURL.probablyRootURL()) {
clickdepth = 0;
this.lazy = lc;
} else {
clickdepth = 999;
}
@ -693,7 +671,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
final DigestURL canonical = html.getCanonical();
DigestURL canonical = html.getCanonical();
// if there is no canonical in the html then look into the http header:
if (canonical == null) {
String link = responseHeader.get("Link", null);
int p;
if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
link = link.substring(0, p).trim();
p = link.indexOf('<');
int q = link.lastIndexOf('>');
if (p >= 0 && q > 0) {
link = link.substring(p + 1, q);
try {
canonical = new DigestURL(link);
} catch (MalformedURLException e) {}
}
}
}
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
containsCanonical = true;
inboundLinks.remove(canonical);
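
As a worked illustration of the header parsing above, a self-contained sketch with a hypothetical Link header value; YaCy then wraps the extracted string in a DigestURL:

// extract rel="canonical" from an HTTP Link header value (plain String operations only)
String link = "<https://example.org/page>; rel=\"canonical\"";    // hypothetical header value
String canonicalUrl = null;
int p = link.indexOf("rel=\"canonical\"");
if (p > 0) {
    String prefix = link.substring(0, p).trim();                   // "<https://example.org/page>;"
    int lt = prefix.indexOf('<');
    int gt = prefix.lastIndexOf('>');
    if (lt >= 0 && gt > 0) canonicalUrl = prefix.substring(lt + 1, gt);
}
// canonicalUrl is now "https://example.org/page"
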
@ -712,7 +706,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (refresh != null && refresh.length() > 0) {
MultiProtocolURL refreshURL;
try {
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath());
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
if (refreshURL != null) {
inboundLinks.remove(refreshURL);
outboundLinks.remove(refreshURL);
@ -785,7 +779,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
String content = document.getTextString();
String tokens = digestURI.toTokens();
String tokens = digestURL.toTokens();
if (content == null || content.length() == 0) {
content = tokens;
} else {
@ -798,9 +792,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) {
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURI.toTokens(); // remove all other entries but the url tokens
content = digestURL.toTokens(); // remove all other entries but the url tokens
}
// content (must be written after special parser data, since this can influence the content)
@ -824,7 +818,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// create a subgraph
if (!containsCanonical) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations, sourceName);
}
// list all links
@ -850,7 +844,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
@ -874,6 +867,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
List<String> p = new ArrayList<String>();
for (ProcessType t: processTypes) p.add(t.name());
add(doc, CollectionSchema.process_sxt, p);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(doc, CollectionSchema.harvestkey_s, sourceName);
}
}
return doc;
}
@ -885,16 +881,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param segment
* @param harvestkey the crawl profile hash key used to restrict postprocessing, or null for all documents
* @return
*/
public int postprocessing(final Segment segment) {
public int postprocessing(final Segment segment, String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
SolrConnector connector = segment.fulltext().getDefaultConnector();
connector.commit(true); // make sure that we have latest information that can be found
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
collectionConnector.commit(true); // make sure that we have latest information that can be found
ReferenceReportCache rrCache = segment.getReferenceReportCache();
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
ReversibleScoreMap<String> hostscore = null;
try {
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> hostscore = connector.getFacets(CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
hostscore = collectionConnector.getFacets(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
@ -912,11 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
ranking.putAll(crn); // accumulate this here for usage in document update later
}
} catch (final IOException e2) {
hostscore = new ClusteredScoreMap<String>();
}
// process all documents
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
// process all documents in the webgraph for the outgoing links of these documents
SolrDocument doc;
if (webgraphConnector != null) {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
0, 10000000, 60000, 50);
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
boolean changed = false;
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
if (changed) try {
webgraphConnector.add(sid);
} catch (SolrException e) {
} catch (IOException e) {
}
}
} catch (final InterruptedException e) {}
}
}
// process all documents in collection
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
0, 10000, 60000, 50);
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
@ -964,12 +1003,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;
// all processing steps checked, remove the processing tag
// all processing steps checked, remove the processing and harvesting key
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
// send back to index
//connector.deleteById(ASCII.String(id));
connector.add(sid);
collectionConnector.add(sid);
proccount++;
} catch (final Throwable e1) {
}
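
A minimal sketch of the Solr query string that the harvestkey filter above produces; the key value is hypothetical:

String harvestkey = "AbCdEfGh";                                    // hypothetical crawl profile hash
String query = (harvestkey == null ? "" :
        CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ")
        + CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
// query == "harvestkey_s:\"AbCdEfGh\" AND process_sxt:[* TO *]"
// a null harvestkey restores the unfiltered behaviour: "process_sxt:[* TO *]"
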
@ -1269,6 +1310,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
configuration.add(doc, CollectionSchema.collection_sxt, cs);
}
// clickdepth, cr and postprocessing
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
}
if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if (allAttr || configuration.contains(CollectionSchema.process_sxt)) {
List<String> p = new ArrayList<String>();
for (ProcessType t: processTypes) p.add(t.name());
configuration.add(doc, CollectionSchema.process_sxt, p);
}
return doc;
}

@ -59,6 +59,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
@ -231,6 +232,23 @@ public enum CollectionSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
// verify our naming scheme
String name = this.name();
int p = name.indexOf('_');
if (p > 0) {
String ext = name.substring(p + 1);
assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
assert !ext.equals("val") || (type == SolrType.num_integer && multiValued) : name;
assert !ext.equals("d") || (type == SolrType.num_double && !multiValued) : name;
}
assert type.appropriateName(this) : "bad configuration: " + this.name();
}
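
The assertions above encode the field naming convention: the suffix after the first underscore must match the declared Solr type and cardinality. A minimal standalone sketch of the same check, covering only a few suffixes and using hypothetical field names:

// returns true when a field name's suffix matches its declared Solr type and cardinality
static boolean suffixMatches(final String name, final SolrType type, final boolean multiValued) {
    final int p = name.indexOf('_');
    if (p <= 0) return true;                                       // no suffix to verify
    final String ext = name.substring(p + 1);
    if (ext.equals("i"))   return type == SolrType.num_integer && !multiValued;
    if (ext.equals("s"))   return type == SolrType.string      && !multiValued;
    if (ext.equals("sxt")) return type == SolrType.string      &&  multiValued;
    if (ext.equals("dt"))  return type == SolrType.date        && !multiValued;
    return true;                                                   // remaining suffixes omitted here
}
// suffixMatches("host_s",       SolrType.string, false) -> true
// suffixMatches("process_sxt",  SolrType.string, true)  -> true
// suffixMatches("clickdepth_i", SolrType.string, false) -> false (suffix _i requires num_integer)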

@ -117,7 +117,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
final IndexCell<CitationReference> citations) {
final IndexCell<CitationReference> citations, final String sourceName) {
boolean allAttr = this.isEmpty();
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
@ -284,6 +284,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(edge, CollectionSchema.harvestkey_s, sourceName);
}
}
// add the edge to the subgraph
@ -291,7 +294,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
}
public int postprocessing(Segment segment) {
public int postprocessing(final Segment segment, final String harvestkey) {
if (!this.contains(WebgraphSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
if (!segment.fulltext().writeToWebgraph()) return 0;
@ -299,7 +302,10 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// that means we must search for those entries.
connector.commit(true); // make sure that we have latest information that can be found
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 100000, 60000, 50);
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
0, 100000, 60000, 50);
SolrDocument doc;
String protocol, urlstub, id;
@ -335,6 +341,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// all processing steps checked, remove the processing tag and the harvest key
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
// send back to index
connector.add(sid);

@ -36,6 +36,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
@ -51,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@ -85,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),
@ -114,6 +117,23 @@ public enum WebgraphSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
// verify our naming scheme
String name = this.name();
int p = name.indexOf('_');
if (p > 0) {
String ext = name.substring(p + 1);
assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
assert !ext.equals("val") || (type == SolrType.num_integer && multiValued) : name;
assert !ext.equals("d") || (type == SolrType.num_double && !multiValued) : name;
}
assert type.appropriateName(this) : "bad configuration: " + this.name();
}

@ -391,7 +391,6 @@ public final class TemplateEngine {
// #%
} else if ((bb & 0xFF) == pcChar) { //include
final ByteBuffer include = new ByteBuffer();
keyStream.reset(); //reset stream
if(transferUntil(pis, keyStream, iClose)){
byte[] filename = keyStream.toByteArray();
@ -403,6 +402,7 @@ public final class TemplateEngine {
filename= replacePattern(patternkey, pattern, dflt);
}
if (filename.length > 0 && !java.util.Arrays.equals(filename, dflt)) {
final ByteBuffer include = new ByteBuffer();
BufferedReader br = null;
try{
//br = new BufferedReader(new InputStreamReader(new FileInputStream( filename ))); //Simple Include
@ -422,9 +422,9 @@ public final class TemplateEngine {
structure.append(ASCII.getBytes("<fileinclude file=\"")).append(filename).append(close_tagn);
structure.append(writeTemplate(pis2, out, pattern, dflt, new byte[0])); //clear pattern prefix for include
structure.append(ASCII.getBytes("</fileinclude>\n"));
include.close();
}
}
// # - no special character. This is simply a '#' without meaning
} else { //no match, but a single hash (output # + bb)
out.write(hashChar);

@ -63,7 +63,7 @@ public class serverSwitch
public final File dataPath;
public final File appPath;
protected boolean firstInit;
protected ConcurrentLog log;
public ConcurrentLog log;
protected int serverJobs;
private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved;
