merge with rc1/master

pull/1/head
commit c7c706fd9f (author: reger, 12 years ago)

@@ -56,14 +56,14 @@
 <string>$JAVAROOT/lib/commons-io-2.1.jar</string>
 <string>$JAVAROOT/lib/commons-jxpath-1.3.jar</string>
 <string>$JAVAROOT/lib/commons-lang-2.6.jar</string>
-<string>$JAVAROOT/lib/commons-logging-1.1.1.jar</string>
+<string>$JAVAROOT/lib/commons-logging-1.1.3.jar</string>
 <string>$JAVAROOT/lib/fontbox-1.8.2.jar</string>
 <string>$JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar</string>
 <string>$JAVAROOT/lib/guava-13.0.1.jar</string>
 <string>$JAVAROOT/lib/htmllexer.jar</string>
-<string>$JAVAROOT/lib/httpclient-4.2.5.jar</string>
-<string>$JAVAROOT/lib/httpcore-4.2.4.jar</string>
-<string>$JAVAROOT/lib/httpmime-4.2.5.jar</string>
+<string>$JAVAROOT/lib/httpclient-4.3.jar</string>
+<string>$JAVAROOT/lib/httpcore-4.3.jar</string>
+<string>$JAVAROOT/lib/httpmime-4.3.jar</string>
 <string>$JAVAROOT/lib/icu4j-core.jar</string>
 <string>$JAVAROOT/lib/iri-0.8.jar</string>
 <string>$JAVAROOT/lib/J7Zip-modified.jar</string>

@@ -90,6 +90,9 @@ clickdepth_i
 ## needed (post-)processing steps on this metadata set
 process_sxt
+## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
+harvestkey_s
 ### optional but highly recommended values, part of the index distribution process

@@ -26,6 +26,9 @@ collection_sxt
 ## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation.
 #process_sxt
+## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
+harvestkey_s
 ##
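
Both schema files gain the same harvestkey_s field. A hedged illustration of the intended use (the SolrJ call and the profileHash variable are assumptions for this sketch, not code from this commit): a near-realtime postprocessing run can select exactly the documents of one harvest by filtering on the key, which is wiped once postprocessing terminates.

    import org.apache.solr.client.solrj.SolrQuery;

    // Select documents written by one crawl (identified by its profile hash key)
    // that still carry pending (post-)processing steps.
    String profileHash = "..."; // the crawl profile hash key described above
    SolrQuery query = new SolrQuery("harvestkey_s:\"" + profileHash + "\" AND process_sxt:[* TO *]");
    query.setRows(1000); // work through the harvest in batches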
@@ -71,6 +74,10 @@ source_id_s
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
 #source_clickdepth_i
+## copy of the citation rank norm value from the source link
+source_cr_host_norm_i
 ## host of the url (source)
 #source_host_s
@@ -168,6 +175,10 @@ target_path_folders_sxt
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
 #target_clickdepth_i
+## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
+target_cr_host_norm_i
 ## host of the url (target)
 #target_host_s

@@ -772,7 +772,7 @@ search.result.show.tags = false
 # search navigators: comma-separated list of default values for search navigation.
 # can be temporarily different if the search string is given with different navigation values
 # assigning no value(s) means that no navigation is shown
-search.navigation=hosts,authors,namespace,topics,filetype,protocol
+search.navigation=location,hosts,authors,namespace,topics,filetype,protocol
 # search result verification and snippet fetch caching rules
 # each search result can be verified by loading the link from the web

@@ -105,6 +105,7 @@ public class CrawlProfileEditor_p {
                 if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
                 // delete all entries from the crawl queue that are deleted here
                 sb.crawler.removeActive(handle.getBytes());
+                sb.crawler.removePassive(handle.getBytes());
                 sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
             } catch (final SpaceExceededException e) {
                 ConcurrentLog.logException(e);

@@ -129,6 +129,7 @@ public class Crawler_p {
                 if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
                 // delete all entries from the crawl queue that are deleted here
                 sb.crawler.removeActive(handle.getBytes());
+                sb.crawler.removePassive(handle.getBytes());
                 sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
             } catch (final SpaceExceededException e) {
                 ConcurrentLog.logException(e);
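
The same three-step deletion now appears in CrawlProfileEditor_p, Crawler_p and (below) IndexCreateQueues_p: purge the profile handle from the active and the passive profile stack, then drop queued URLs of that profile. A minimal sketch of the shared pattern, assuming the YaCy import paths; the helper itself is hypothetical and is not factored out by the commit:

    import net.yacy.cora.util.ConcurrentLog;
    import net.yacy.cora.util.SpaceExceededException;
    import net.yacy.search.Switchboard;

    private static void removeProfileEverywhere(final Switchboard sb, final String handle) {
        final byte[] handleBytes = handle.getBytes();
        sb.crawler.removeActive(handleBytes);   // profiles of running crawls
        sb.crawler.removePassive(handleBytes);  // profiles of terminated crawls
        try {
            // drop at most 10000 queued URLs stacked under this profile
            sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
        } catch (final SpaceExceededException e) {
            ConcurrentLog.logException(e);
        }
    }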

@@ -316,10 +316,18 @@ public class DictionaryLoader_p {
         }
         // check status again
+        boolean keepPlacesTagging = false;
         for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {
-            prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0);
+            int newstatus = dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0;
+            if (newstatus == 1) keepPlacesTagging = true;
+            prop.put(dictionary.nickname + "Status", newstatus);
         }
+        // if all locations are deleted or deactivated, remove also the vocabulary
+        if (!keepPlacesTagging) {
+            LibraryProvider.autotagging.removePlaces();
+        }
         return prop; // return rewrite values for templates
     }
 }
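
The status ternary is unchanged; it only gains a name so the loop can remember whether any dictionary is still active before dropping the places vocabulary. Extracted as a hedged sketch (the helper is hypothetical; the status codes are read off the hunk):

    // 1 = dictionary file present (active), 2 = only the disabled file present,
    // 0 = not installed; places tagging survives while at least one status is 1.
    static int dictionaryStatus(final boolean fileExists, final boolean disabledFileExists) {
        if (fileExists) return 1;
        if (disabledFileExists) return 2;
        return 0;
    }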

@@ -635,7 +635,8 @@ public class IndexControlRWIs_p {
                 "", // userAgent
                 false,
                 false,
-                0.0d, 0.0d, 0.0d);
+                0.0d, 0.0d, 0.0d,
+                new String[0]);
             final SearchEvent theSearch = SearchEventCache.getEvent(query, sb.peers, sb.tables, null, false, sb.loader, Integer.MAX_VALUE, Long.MAX_VALUE, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
             if (theSearch.rwiProcess != null && theSearch.rwiProcess.isAlive()) try {theSearch.rwiProcess.join();} catch (final InterruptedException e) {}
             if (theSearch.local_rwi_available.get() == 0) {

@@ -69,7 +69,10 @@ public class IndexCreateQueues_p {
                 entry = sb.crawler.getActive(handle);
                 final String name = entry.name();
                 if (CrawlSwitchboard.DEFAULT_PROFILES.contains(name)) continue;
-                if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
+                if (compiledPattern.matcher(name).find()) {
+                    sb.crawler.removeActive(entry.handle().getBytes());
+                    sb.crawler.removePassive(entry.handle().getBytes());
+                }
             }
         } else {
             // iterating through the list of URLs

@@ -251,7 +251,8 @@ public final class search {
                 false,
                 0.0d,
                 0.0d,
-                0.0d
+                0.0d,
+                new String[0]
             );
             Network.log.info("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
@@ -315,7 +316,8 @@ public final class search {
                 false,
                 0.0d,
                 0.0d,
-                0.0d
+                0.0d,
+                new String[0]
             );
             Network.log.info("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
             EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()), ""));

@@ -668,7 +668,8 @@ public class yacysearch {
                 && sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false)
                 && sb.peers.mySeed().getFlagAcceptRemoteIndex(),
                 false,
-                lat, lon, rad);
+                lat, lon, rad,
+                sb.getConfig("search_navigation","").split(","));
             EventTracker.delete(EventTracker.EClass.SEARCH);
             EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
                 theQuery.id(true),
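
Every QueryParams call site now supplies a trailing String[] of navigator names: internal and remote-abstract searches pass new String[0] to disable navigation, while yacysearch derives the array from the search.navigation setting changed above. A self-contained sketch of that derivation; the class is illustrative only, and unlike the commit it guards against an empty setting (in Java, "".split(",") yields a one-element array):

    final class NavigationConfigDemo {
        static String[] navigators(final String configured) {
            return configured.isEmpty() ? new String[0] : configured.split(",");
        }
        public static void main(final String[] args) {
            final String[] nav = navigators("location,hosts,authors,namespace,topics,filetype,protocol");
            System.out.println(nav.length + " navigators, first = " + nav[0]); // 7 navigators, first = location
        }
    }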

@@ -26,6 +26,7 @@ import java.util.concurrent.TimeUnit;
 import net.yacy.cora.document.feed.RSSMessage;
 import net.yacy.cora.federate.opensearch.SRURSSConnector;
 import net.yacy.cora.geo.GeoLocation;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -93,7 +94,7 @@ public class yacysearch_location {
         // get a queue of search results
         final String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
         final BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
-        SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, null);
+        SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, ClientIdentification.yacyInternetCrawlerAgent);
         // take the results and compute some locations
         RSSMessage message;

@@ -387,7 +387,8 @@ public class yacysearchtrailer {
             // category: location search
             // show only if there is a location database present and if there had been any search results
-            if (LibraryProvider.geoLoc.isEmpty() || theSearch.getResultCount() == 0) {
+            if ((LibraryProvider.geoLoc.isEmpty() || theSearch.getResultCount() == 0) &&
+                (theSearch.locationNavigator == null || theSearch.locationNavigator.isEmpty())) {
                 prop.put("cat-location", 0);
             } else {
                 prop.put("cat-location", 1);
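
Read as a predicate, the widened condition keeps the location facet visible if either signal is present. A standalone restatement with the inputs abstracted (method and parameter names are mine, not the commit's):

    static boolean showLocationFacet(final boolean geoLocEmpty, final int resultCount,
                                     final boolean locationNavigatorEmpty) {
        final boolean geoDatabaseHit = !geoLocEmpty && resultCount > 0;
        return geoDatabaseHit || !locationNavigatorEmpty; // either source suffices
    }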

Binary file not shown.

Binary file not shown.

@@ -22,8 +22,8 @@ commons-io-2.1.jar
 commons-lang-2.6.jar
 geronimo-stax-api_1.0_spec-1.0.1.jar
 guava-r05.jar
-httpclient-4.2.3.jar
-httpcore-4.2.3.jar
+httpclient-4.3.jar
+httpcore-4.3.jar
 jcl-over-slf4j-1.6.1.jar
 log4j-over-slf4j-1.6.1.jar
 lucene-analyzers-3.6.0.jar

Binary file not shown.

Binary file not shown.

@@ -1,240 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
=========================================================================
This project contains annotations in the package org.apache.http.annotation
which are derived from JCIP-ANNOTATIONS
Copyright (c) 2005 Brian Goetz and Tim Peierls.
See http://www.jcip.net and the Creative Commons Attribution License
(http://creativecommons.org/licenses/by/2.5)
Full text: http://creativecommons.org/licenses/by/2.5/legalcode
License
THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
1. Definitions
"Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.
"Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.
"Licensor" means the individual or entity that offers the Work under the terms of this License.
"Original Author" means the individual or entity who created the Work.
"Work" means the copyrightable work of authorship offered under the terms of this License.
"You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.
3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:
to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;
to create and reproduce Derivative Works;
to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;
to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.
For the avoidance of doubt, where the work is a musical composition:
Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.
Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights agency or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).
Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).
The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.
4. Restrictions.The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:
You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any credit as required by clause 4(b), as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any credit as required by clause 4(b), as requested.
If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or (ii) if the Original Author and/or Licensor designate another party or parties (e.g. a sponsor institute, publishing entity, journal) for attribution in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.
5. Representations, Warranties and Disclaimer
UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. Termination
This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
8. Miscellaneous
Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.

Binary file not shown.

@@ -1,202 +1,176 @@
 [Apache License, Version 2.0 boilerplate, Sections 1 through 9 and "END OF TERMS AND CONDITIONS": only re-wrapped in this hunk, wording unchanged; the same text appears in full in the removed license file above.]
-APPENDIX: How to apply the Apache License to your work.
-   To apply the Apache License to your work, attach the following
-   boilerplate notice, with the fields enclosed by brackets "[]"
-   replaced with your own identifying information. (Don't include
-   the brackets!) The text should be enclosed in the appropriate
-   comment syntax for the file format. We also recommend that a
-   file or class name and description of purpose be included on the
-   same "printed page" as the copyright notice for easier
-   identification within third-party archives.
-Copyright [yyyy] [name of copyright owner]
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,176 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS


@ -73,7 +73,7 @@
<compilation-unit> <compilation-unit>
<package-root>source</package-root> <package-root>source</package-root>
<package-root>htroot</package-root> <package-root>htroot</package-root>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.2.3.jar;lib/httpcore-4.2.3.jar;lib/httpmime-4.2.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar</classpath> <classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.jar;lib/httpcore-4.3.jar;lib/httpmime-4.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar</classpath>
<source-level>1.6</source-level> <source-level>1.6</source-level>
</compilation-unit> </compilation-unit>
</java-data> </java-data>

@ -24,10 +24,10 @@
package net.yacy.cora.document.encoding; package net.yacy.cora.document.encoding;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Comparator; import java.util.Comparator;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.content.StringBody; import org.apache.http.entity.mime.content.StringBody;
/** /**
@ -45,6 +45,7 @@ public class UTF8 implements Comparator<String> {
static { static {
charset = Charset.forName("UTF-8"); charset = Charset.forName("UTF-8");
} }
private final static ContentType contentType = ContentType.TEXT_PLAIN.withCharset(charset);
public static final UTF8 insensitiveUTF8Comparator = new UTF8(true); public static final UTF8 insensitiveUTF8Comparator = new UTF8(true);
public static final UTF8 identityUTF8Comparator = new UTF8(false); public static final UTF8 identityUTF8Comparator = new UTF8(false);
@ -103,12 +104,7 @@ public class UTF8 implements Comparator<String> {
} }
public final static StringBody StringBody(final String s) { public final static StringBody StringBody(final String s) {
try { return new StringBody(s == null ? "" : s, contentType);
return new StringBody(s == null ? "" : s, charset);
} catch (final UnsupportedEncodingException e) {
e.printStackTrace();
return null;
}
} }
/** /**
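The hunk above replaces the deprecated StringBody(String, Charset) constructor, which forced a pointless UnsupportedEncodingException handler, with the ContentType-based constructor from httpmime 4.3. A minimal self-contained sketch of the new pattern (class name is illustrative):

import java.nio.charset.Charset;

import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.content.StringBody;

public class Utf8StringBodyDemo {
    public static void main(String[] args) {
        // the ContentType now carries the charset, so no checked encoding exception can occur
        ContentType textPlainUtf8 = ContentType.TEXT_PLAIN.withCharset(Charset.forName("UTF-8"));
        StringBody body = new StringBody("FooBar", textPlainUtf8);
        System.out.println(body.getMimeType() + "; length=" + body.getContentLength());
    }
}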

@ -48,7 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
public class DigestURL extends MultiProtocolURL implements Serializable { public class DigestURL extends MultiProtocolURL implements Serializable {
private static final long serialVersionUID = -1173233022912141885L; private static final long serialVersionUID = -1173233022912141885L;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
// class variables // class variables
private byte[] hash; private byte[] hash;

@ -24,6 +24,8 @@ import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.openjena.atlas.logging.Log;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
@ -75,16 +77,22 @@ public class Ranking {
* @param boostDef the definition string * @param boostDef the definition string
*/ */
public void updateBoosts(String boostDef) { public void updateBoosts(String boostDef) {
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0" // call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description_txt^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
if (boostDef == null || boostDef.length() == 0) return; if (boostDef == null || boostDef.length() == 0) return;
String[] bf = CommonPattern.COMMA.split(boostDef); String[] bf = CommonPattern.COMMA.split(boostDef);
this.fieldBoosts.clear(); this.fieldBoosts.clear();
for (String boost: bf) { for (String boost: bf) {
int p = boost.indexOf('^'); int p = boost.indexOf('^');
if (p < 0) continue; if (p < 0) continue;
CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p)); String boostkey = boost.substring(0, p);
Float factor = Float.parseFloat(boost.substring(p + 1)); try {
this.fieldBoosts.put(field, factor); CollectionSchema field = CollectionSchema.valueOf(boostkey);
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
} catch (IllegalArgumentException e) {
// boostkey is unknown; ignore it but print warning
Log.warn("Ranking", "unknown boost key '" + boostkey + "'");
}
} }
} }
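With this change an unknown field name in the boost definition is logged and skipped instead of letting CollectionSchema.valueOf() throw an IllegalArgumentException that aborts the whole loop. A hedged usage sketch (the no-argument construction and the field names are assumptions for illustration):

Ranking ranking = new Ranking(); // no-argument construction assumed for the sketch
// 'sku' and 'text_t' are CollectionSchema fields; 'no_such_field' is not
ranking.updateBoosts("sku^20.0,text_t^1.0,no_such_field^5.0");
// before: CollectionSchema.valueOf("no_such_field") threw and no boosts were applied at all
// after: a warning is logged for 'no_such_field' and the two valid boosts are still kept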

@ -33,6 +33,7 @@ import java.util.Set;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
} }
} }
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index is to be modified and indexed again.
* This is used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check that the field is enabled in the local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
}
public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check that the field is enabled in the local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) { public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false; boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT // FIND OUT IF THIS IS A DOUBLE DOCUMENT
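A hedged sketch of how the new converter could be used during near-realtime postprocessing; the omitted field name "_version_" and the helper method are assumptions for illustration, and the YaCy-specific import of SchemaConfiguration is left out:

import java.util.HashSet;
import java.util.Set;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;

public class ReindexDemo {
    // turn a document read back from the index into one that can be submitted again
    public static SolrInputDocument reindexable(SchemaConfiguration configuration, SolrDocument stored) {
        Set<String> omit = new HashSet<String>();
        omit.add("_version_"); // assumed: a field the index fills in automatically and rejects on re-submit
        return configuration.toSolrInputDocument(stored, omit);
    }
}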

@ -46,24 +46,17 @@ import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache; import org.apache.http.client.AuthCache;
import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.HttpContext; import org.apache.http.protocol.HttpContext;
import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.impl.HttpSolrServer;
@SuppressWarnings("deprecation") //TODO: switch to 4.3-Stuff
public class RemoteInstance implements SolrInstance { public class RemoteInstance implements SolrInstance {
private String solrurl; private String solrurl;
private final DefaultHttpClient client; private final org.apache.http.impl.client.DefaultHttpClient client;
// 4.3 private final CloseableHttpClient client;
private final String defaultCoreName; private final String defaultCoreName;
private final HttpSolrServer defaultServer; private final HttpSolrServer defaultServer;
private final Collection<String> coreNames; private final Collection<String> coreNames;
@ -133,25 +126,73 @@ public class RemoteInstance implements SolrInstance {
} }
} }
if (solraccount.length() > 0) { if (solraccount.length() > 0) {
PoolingClientConnectionManager cm = new PoolingClientConnectionManager(); // try also: ThreadSafeClientConnManager // 4.3:
// final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
// cm.setMaxTotal(100);
//
// final RequestConfig.Builder reqBuilder = RequestConfig.custom();
// reqBuilder.setSocketTimeout(timeout);
// reqBuilder.setConnectTimeout(timeout);
// reqBuilder.setConnectionRequestTimeout(timeout);
//
// final BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
// credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw));
//
// final HttpClientBuilder builder = HttpClientBuilder.create();
// builder.setConnectionManager(cm);
// builder.setDefaultRequestConfig(reqBuilder.build());
// builder.setDefaultCredentialsProvider(credsProvider);
// builder.disableAutomaticRetries(); // no retries needed; we expect connections to fail; therefore we should not retry
// // ask for gzip - why not use net.yacy.cora.protocol.http.GzipRequestInterceptor?
// builder.addInterceptorLast(new HttpRequestInterceptor() {
// @Override
// public void process(final HttpRequest request, final HttpContext context) throws IOException {
// if (!request.containsHeader("Accept-Encoding")) request.addHeader("Accept-Encoding", "gzip");
// if (!request.containsHeader("Connection")) request.addHeader("Connection", "close"); // prevent CLOSE_WAIT
// }
//
// });
// // uncompress gzip - why not use net.yacy.cora.protocol.http.GzipResponseInterceptor?
// builder.addInterceptorLast(new HttpResponseInterceptor() {
// @Override
// public void process(final HttpResponse response, final HttpContext context) throws IOException {
// HttpEntity entity = response.getEntity();
// if (entity != null) {
// Header ceheader = entity.getContentEncoding();
// if (ceheader != null) {
// HeaderElement[] codecs = ceheader.getElements();
// for (HeaderElement codec : codecs) {
// if (codec.getName().equalsIgnoreCase("gzip")) {
// response.setEntity(new GzipDecompressingEntity(response.getEntity()));
// return;
// }
// }
// }
// }
// }
// });
// this.client = builder.build();
// old Stuff START
org.apache.http.impl.conn.PoolingClientConnectionManager cm = new org.apache.http.impl.conn.PoolingClientConnectionManager(); // try also: ThreadSafeClientConnManager
cm.setMaxTotal(100); cm.setMaxTotal(100);
this.client = new DefaultHttpClient(cm) { this.client = new org.apache.http.impl.client.DefaultHttpClient(cm) {
@Override @Override
protected HttpContext createHttpContext() { protected HttpContext createHttpContext() {
HttpContext context = super.createHttpContext(); HttpContext context = super.createHttpContext();
AuthCache authCache = new BasicAuthCache(); AuthCache authCache = new org.apache.http.impl.client.BasicAuthCache();
BasicScheme basicAuth = new BasicScheme(); BasicScheme basicAuth = new BasicScheme();
HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol()); HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol());
authCache.put(targetHost, basicAuth); authCache.put(targetHost, basicAuth);
context.setAttribute(ClientContext.AUTH_CACHE, authCache); context.setAttribute(org.apache.http.client.protocol.ClientContext.AUTH_CACHE, authCache);
this.setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false)); // no retries needed; we expect connections to fail; therefore we should not retry this.setHttpRequestRetryHandler(new org.apache.http.impl.client.DefaultHttpRequestRetryHandler(0, false)); // no retries needed; we expect connections to fail; therefore we should not retry
return context; return context;
} }
}; };
HttpParams params = this.client.getParams(); org.apache.http.params.HttpParams params = this.client.getParams();
HttpConnectionParams.setConnectionTimeout(params, timeout); org.apache.http.params.HttpConnectionParams.setConnectionTimeout(params, timeout);
HttpConnectionParams.setSoTimeout(params, timeout); org.apache.http.params.HttpConnectionParams.setSoTimeout(params, timeout);
this.client.addRequestInterceptor(new HttpRequestInterceptor() { this.client.addRequestInterceptor(new HttpRequestInterceptor() {
@Override @Override
public void process(final HttpRequest request, final HttpContext context) throws IOException { public void process(final HttpRequest request, final HttpContext context) throws IOException {
@ -178,9 +219,10 @@ public class RemoteInstance implements SolrInstance {
} }
} }
}); });
BasicCredentialsProvider credsProvider = new BasicCredentialsProvider(); org.apache.http.impl.client.BasicCredentialsProvider credsProvider = new org.apache.http.impl.client.BasicCredentialsProvider();
credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw)); credsProvider.setCredentials(new AuthScope(host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(solraccount, solrpw));
this.client.setCredentialsProvider(credsProvider); this.client.setCredentialsProvider(credsProvider);
// old Stuff END
} else { } else {
this.client = null; this.client = null;
} }
@ -248,7 +290,14 @@ public class RemoteInstance implements SolrInstance {
@Override @Override
public void close() { public void close() {
if (this.client != null) this.client.getConnectionManager().shutdown(); if (this.client != null) this.client.getConnectionManager().shutdown();
// 4.3
// if (this.client != null)
// try {
// this.client.close();
// } catch (final IOException e) {
// // TODO Auto-generated catch block
// }
} }
} }

@ -110,7 +110,7 @@ public class AutotaggingLibrary {
} }
public void addPlaces(Locations locations) { public void addPlaces(Locations locations) {
if (locations.isEmpty()) return; // otherwise we get a navigation that does nothing if (locations.isEmpty()) return; // otherwise we get a navigation that does nothing
Tagging voc = new Tagging("Locations", locations); Tagging voc = new Tagging("Locations", locations);
try { try {
voc.setObjectspace("http://dbpedia.org/resource/"); voc.setObjectspace("http://dbpedia.org/resource/");
@ -122,6 +122,10 @@ public class AutotaggingLibrary {
} }
} }
public void removePlaces() {
this.vocabularies.remove("Locations");
}
public int size() { public int size() {
return this.vocabularies.size(); return this.vocabularies.size();
} }

@ -28,6 +28,7 @@ package net.yacy.cora.protocol;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MIME; import org.apache.http.entity.mime.MIME;
import org.apache.http.entity.mime.content.AbstractContentBody; import org.apache.http.entity.mime.content.AbstractContentBody;
@ -42,7 +43,7 @@ public class ByteArrayBody extends AbstractContentBody {
* @param filename * @param filename
*/ */
public ByteArrayBody(final byte[] bytes, final String filename) { public ByteArrayBody(final byte[] bytes, final String filename) {
super("application/octet-stream"); super(ContentType.APPLICATION_OCTET_STREAM);
this.bytes = bytes; this.bytes = bytes;
this.filename = filename; this.filename = filename;
} }

@ -74,8 +74,8 @@ public class Domains {
private static final String PRESENT = ""; private static final String PRESENT = "";
private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)"); private static final Pattern LOCAL_PATTERNS = Pattern.compile("(10\\..*)|(127\\..*)|(172\\.(1[6-9]|2[0-9]|3[0-1])\\..*)|(169\\.254\\..*)|(192\\.168\\..*)|(localhost)|(\\[?\\:\\:1/.*)|(\\[?fc.*)|(\\[?fd.*)|(\\[?(fe80|0)\\:0\\:0\\:0\\:0\\:0\\:0\\:1.*)");
private static final int MAX_NAME_CACHE_HIT_SIZE = 100000; private static final int MAX_NAME_CACHE_HIT_SIZE = 10000;
private static final int MAX_NAME_CACHE_MISS_SIZE = 100000; private static final int MAX_NAME_CACHE_MISS_SIZE = 1000;
private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2; private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() * 2;
// a dns cache // a dns cache
@ -782,7 +782,7 @@ public class Domains {
public InetAddress call() throws Exception { public InetAddress call() throws Exception {
return InetAddress.getByName(host); return InetAddress.getByName(host);
} }
}, 1000L, TimeUnit.MILLISECONDS, false); }, 3000L, TimeUnit.MILLISECONDS, false);
//ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone //ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
} }
//.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms"); //.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms");
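The hunk raises the bound on the blocking lookup from 1 s to 3 s. TimeoutRequest is YaCy's own helper; as a rough standalone equivalent, the same bound can be sketched with a plain Future (all names here are illustrative):

import java.net.InetAddress;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class BoundedLookupDemo {
    public static InetAddress resolve(final String host) {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        Future<InetAddress> future = executor.submit(new Callable<InetAddress>() {
            @Override
            public InetAddress call() throws Exception {
                return InetAddress.getByName(host); // the blocking backbone DNS request
            }
        });
        try {
            return future.get(3000L, TimeUnit.MILLISECONDS); // same 3 s bound as the patch
        } catch (TimeoutException e) {
            future.cancel(true); // give up on slow resolvers instead of stalling the caller
            return null;
        } catch (Exception e) {
            return null;
        } finally {
            executor.shutdown();
        }
    }
}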

@ -28,12 +28,12 @@ package net.yacy.cora.protocol.http;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.UnsupportedEncodingException; import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.KeyManagementException; import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException; import java.security.cert.CertificateException;
import java.security.cert.X509Certificate; import java.security.cert.X509Certificate;
import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
@ -49,7 +49,6 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ConnectionInfo; import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.http.ProxySettings.Protocol;
import org.apache.http.Header; import org.apache.http.Header;
import org.apache.http.HeaderElement; import org.apache.http.HeaderElement;
@ -58,41 +57,32 @@ import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest; import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion; import org.apache.http.client.config.CookieSpecs;
import org.apache.http.auth.AuthScope; import org.apache.http.client.config.RequestConfig;
import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead; import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.params.CookiePolicy; import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.params.HttpClientParams; import org.apache.http.config.Registry;
import org.apache.http.client.protocol.ClientContext; import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.ClientConnectionManager; import org.apache.http.config.SocketConfig;
import org.apache.http.conn.ConnectionKeepAliveStrategy; import org.apache.http.conn.ConnectionKeepAliveStrategy;
import org.apache.http.conn.params.ConnRouteParams; import org.apache.http.conn.DnsResolver;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.conn.routing.HttpRoute; import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.InputStreamEntity; import org.apache.http.entity.InputStreamEntity;
import org.apache.http.entity.mime.MultipartEntity; import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.ContentBody; import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.entity.mime.content.StringBody; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader; import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicHeaderElementIterator; import org.apache.http.message.BasicHeaderElementIterator;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HTTP; import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext; import org.apache.http.protocol.HttpContext;
import org.apache.http.util.ByteArrayBuffer; import org.apache.http.util.ByteArrayBuffer;
@ -108,106 +98,126 @@ import org.apache.http.util.EntityUtils;
public class HTTPClient { public class HTTPClient {
private final static int maxcon = 200; private final static int maxcon = 200;
private static IdledConnectionEvictor idledConnectionEvictor = null; private static IdleConnectionMonitorThread connectionMonitor = null;
private static HttpClient httpClient = initConnectionManager(); private final static RequestConfig dfltReqConf = initRequestConfig();
private static final CredentialsProvider credsProvider = new BasicCredentialsProvider(); private final static HttpClientBuilder clientBuilder = initClientBuilder();
private final RequestConfig.Builder reqConfBuilder;
private Set<Entry<String, String>> headers = null; private Set<Entry<String, String>> headers = null;
private HttpResponse httpResponse = null; private CloseableHttpResponse httpResponse = null;
private HttpUriRequest currentRequest = null; private HttpUriRequest currentRequest = null;
private long upbytes = 0L; private long upbytes = 0L;
private int timeout = 10000;
private ClientIdentification.Agent agent = null;
private String host = null; private String host = null;
private boolean redirecting = true;
private String realm = null; private String realm = null;
public HTTPClient(final ClientIdentification.Agent agent) { public HTTPClient(final ClientIdentification.Agent agent) {
super(); super();
this.agent = agent; clientBuilder.setUserAgent(agent.userAgent);
this.timeout = agent.clientTimeout; reqConfBuilder = RequestConfig.copy(dfltReqConf);
HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent); reqConfBuilder.setSocketTimeout(agent.clientTimeout);
reqConfBuilder.setConnectTimeout(agent.clientTimeout);
reqConfBuilder.setConnectionRequestTimeout(agent.clientTimeout);
} }
public HTTPClient(final ClientIdentification.Agent agent, final int timeout) { public HTTPClient(final ClientIdentification.Agent agent, final int timeout) {
super(); super();
this.agent = agent; clientBuilder.setUserAgent(agent.userAgent);
this.timeout = timeout; reqConfBuilder = RequestConfig.copy(dfltReqConf);
HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent); reqConfBuilder.setSocketTimeout(timeout);
reqConfBuilder.setConnectTimeout(timeout);
reqConfBuilder.setConnectionRequestTimeout(timeout);
} }
public static void setDefaultUserAgent(final String defaultAgent) { public static void setDefaultUserAgent(final String defaultAgent) {
HttpProtocolParams.setUserAgent(httpClient.getParams(), defaultAgent); clientBuilder.setUserAgent(defaultAgent);
} }
public static HttpClient initConnectionManager() { private static RequestConfig initRequestConfig() {
// Create and initialize scheme registry final RequestConfig.Builder builder = RequestConfig.custom();
final SchemeRegistry schemeRegistry = new SchemeRegistry(); // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); builder.setExpectContinueEnabled(false);
schemeRegistry.register(new Scheme("https", 443, getSSLSocketFactory()));
final PoolingClientConnectionManager clientConnectionManager = new PoolingClientConnectionManager(schemeRegistry);
// Create and initialize HTTP parameters
final HttpParams httpParams = new BasicHttpParams();
/**
* ConnectionManager settings
*/
// how much connections do we need? - default: 20
clientConnectionManager.setMaxTotal(maxcon);
// for statistics same value should also be set here
ConnectionInfo.setMaxcount(maxcon);
// connections per host (2 default)
clientConnectionManager.setDefaultMaxPerRoute(2);
// Increase max connections for localhost
final HttpHost localhost = new HttpHost(Domains.LOCALHOST);
clientConnectionManager.setMaxPerRoute(new HttpRoute(localhost), maxcon);
/**
* HTTP protocol settings
*/
HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1);
// UserAgent
HttpProtocolParams.setUserAgent(httpParams, ClientIdentification.yacyInternetCrawlerAgent.userAgent);
HttpProtocolParams.setUseExpectContinue(httpParams, false); // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
/**
* HTTP connection settings
*/
// timeout in milliseconds until a connection is established // timeout in milliseconds until a connection is established
HttpConnectionParams.setConnectionTimeout(httpParams, 6000); builder.setConnectionRequestTimeout(6000);
// SO_LINGER affects the socket close operation in seconds builder.setConnectTimeout(8000);
// HttpConnectionParams.setLinger(httpParams, 6);
// HttpConnectionParams.setSocketBufferSize(httpParams, 8192);
// SO_TIMEOUT: maximum period of inactivity between two consecutive data packets in milliseconds // SO_TIMEOUT: maximum period of inactivity between two consecutive data packets in milliseconds
HttpConnectionParams.setSoTimeout(httpParams, 1000); builder.setSocketTimeout(3000);
// avoids an I/O error when executing a request over a connection that has been closed at the server side // avoids an I/O error when executing a request over a connection that has been closed at the server side
HttpConnectionParams.setStaleCheckingEnabled(httpParams, true); builder.setStaleConnectionCheckEnabled(true);
// conserve bandwidth by minimizing the number of segments that are sent
HttpConnectionParams.setTcpNoDelay(httpParams, false);
// Defines whether the socket can be bound even though a previous connection is still in a timeout state.
HttpConnectionParams.setSoReuseaddr(httpParams, true);
/**
* HTTP client settings
*/
// ignore cookies, because this may cause segfaults in the default cookiestore and is not needed // ignore cookies, because this may cause segfaults in the default cookiestore and is not needed
HttpClientParams.setCookiePolicy(httpParams, CookiePolicy.IGNORE_COOKIES); builder.setCookieSpec(CookieSpecs.IGNORE_COOKIES);
builder.setRedirectsEnabled(true);
httpClient = new DefaultHttpClient(clientConnectionManager, httpParams); builder.setRelativeRedirectsAllowed(true);
return builder.build();
}
private static HttpClientBuilder initClientBuilder() {
final HttpClientBuilder builder = HttpClientBuilder.create();
builder.setConnectionManager(initPoolingConnectionManager());
builder.setDefaultRequestConfig(dfltReqConf);
// UserAgent
builder.setUserAgent(ClientIdentification.yacyInternetCrawlerAgent.userAgent);
// remove retries; we expect connections to fail; therefore we should not retry
builder.disableAutomaticRetries();
// disable the cookiestore, because this may cause segfaults and is not needed // disable the cookiestore, because this may cause segfaults and is not needed
((DefaultHttpClient) httpClient).setCookieStore(null); builder.setDefaultCookieStore(null);
builder.disableCookieManagement();
// add custom keep alive strategy // add custom keep alive strategy
addCustomKeepAliveStrategy((DefaultHttpClient) httpClient); builder.setKeepAliveStrategy(customKeepAliveStrategy());
// ask for gzip // ask for gzip
((DefaultHttpClient) httpClient).addRequestInterceptor(new GzipRequestInterceptor()); builder.addInterceptorLast(new GzipRequestInterceptor());
// uncompress gzip // uncompress gzip
((DefaultHttpClient) httpClient).addResponseInterceptor(new GzipResponseInterceptor()); builder.addInterceptorLast(new GzipResponseInterceptor());
// remove retries; we expect connections to fail; therefore we should not retry // Proxy
((DefaultHttpClient) httpClient).setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false)); builder.setRoutePlanner(ProxySettings.RoutePlanner);
if (idledConnectionEvictor == null) { builder.setDefaultCredentialsProvider(ProxySettings.CredsProvider);
idledConnectionEvictor = new IdledConnectionEvictor(clientConnectionManager);
idledConnectionEvictor.start(); return builder;
}
private static PoolingHttpClientConnectionManager initPoolingConnectionManager() {
final PlainConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
final Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", plainsf)
.register("https", getSSLSocketFactory())
.build();
final PoolingHttpClientConnectionManager pooling = new PoolingHttpClientConnectionManager(registry, new DnsResolver(){
@Override
public InetAddress[] resolve(final String host0)throws UnknownHostException {
final InetAddress ip = Domains.dnsResolve(host0);
if (ip == null) throw new UnknownHostException(host0);
return new InetAddress[]{ip};
}});
// how much connections do we need? - default: 20
pooling.setMaxTotal(maxcon);
// for statistics same value should also be set here
ConnectionInfo.setMaxcount(maxcon);
// connections per host (2 default)
pooling.setDefaultMaxPerRoute(4);
// Increase max connections for localhost
final HttpHost localhost = new HttpHost(Domains.LOCALHOST);
pooling.setMaxPerRoute(new HttpRoute(localhost), maxcon);
final SocketConfig socketConfig = SocketConfig.custom()
// Defines whether the socket can be bound even though a previous connection is still in a timeout state.
.setSoReuseAddress(true)
// SO_TIMEOUT: maximum period of inactivity between two consecutive data packets in milliseconds
.setSoTimeout(3000)
// conserve bandwidth by minimizing the number of segments that are sent
.setTcpNoDelay(false)
.build();
pooling.setDefaultSocketConfig(socketConfig);
if (connectionMonitor == null) {
connectionMonitor = new IdleConnectionMonitorThread(pooling);
connectionMonitor.start();
} }
return httpClient;
return pooling;
} }
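Condensed from the rewrite above, a self-contained sketch of the httpclient 4.3 builder pattern it adopts: shared RequestConfig defaults, a pooling connection manager, no automatic retries, and a per-request override of the timeouts (URL and timeout values are placeholders):

import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

public class BuilderDemo {
    public static void main(String[] args) throws Exception {
        RequestConfig defaults = RequestConfig.custom()
                .setExpectContinueEnabled(false)          // avoid the 2 s 100-continue stall
                .setCookieSpec(CookieSpecs.IGNORE_COOKIES)
                .setSocketTimeout(3000)
                .setConnectTimeout(8000)
                .build();
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(200);
        CloseableHttpClient client = HttpClientBuilder.create()
                .setConnectionManager(cm)
                .setDefaultRequestConfig(defaults)
                .disableAutomaticRetries()                // we expect connections to fail; do not retry
                .build();
        HttpGet get = new HttpGet("http://localhost:8090/");
        // per-request override, mirroring reqConfBuilder in the patch
        get.setConfig(RequestConfig.copy(defaults).setSocketTimeout(15000).build());
        CloseableHttpResponse response = client.execute(get);
        try {
            System.out.println(response.getStatusLine());
        } finally {
            response.close();
            client.close();
        }
    }
}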
/** /**
@ -217,34 +227,29 @@ public class HTTPClient {
* @throws InterruptedException * @throws InterruptedException
*/ */
public static void closeConnectionManager() throws InterruptedException { public static void closeConnectionManager() throws InterruptedException {
if (idledConnectionEvictor != null) { if (connectionMonitor != null) {
// Shut down the evictor thread // Shut down the evictor thread
idledConnectionEvictor.shutdown(); connectionMonitor.shutdown();
idledConnectionEvictor.join(); connectionMonitor.join();
} }
if (httpClient != null) {
// Shut down the connection manager
httpClient.getConnectionManager().shutdown();
}
} }
public static void setAuth(final String host, final int port, final String user, final String pw) { // public static void setAuth(final String host, final int port, final String user, final String pw) {
final UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user, pw); // final UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user, pw);
final AuthScope scope = new AuthScope(host, port); // final AuthScope scope = new AuthScope(host, port);
credsProvider.setCredentials(scope, creds); // credsProvider.setCredentials(scope, creds);
httpClient.getParams().setParameter(ClientContext.CREDS_PROVIDER, credsProvider); // httpClient.getParams().setParameter(ClientContext.CREDS_PROVIDER, credsProvider);
} // }
/** // /**
* this method sets a host on which more than the default of 2 routes per host are allowed // * this method sets a host on which more than the default of 2 routes per host are allowed
* // *
* @param the host to be raised in 'route per host' // * @param the host to be raised in 'route per host'
*/ // */
public static void setMaxRouteHost(final String host) { // public static void setMaxRouteHost(final String host) {
final HttpHost mHost = new HttpHost(host); // final HttpHost mHost = new HttpHost(host);
((PoolingClientConnectionManager) httpClient.getConnectionManager()).setMaxPerRoute(new HttpRoute(mHost), 50); // ((PoolingClientConnectionManager) httpClient.getConnectionManager()).setMaxPerRoute(new HttpRoute(mHost), 50);
} // }
/** /**
* This method sets the Header used for the request * This method sets the Header used for the request
@ -261,7 +266,9 @@ public class HTTPClient {
* @param timeout in milliseconds * @param timeout in milliseconds
*/ */
public void setTimout(final int timeout) { public void setTimout(final int timeout) {
this.timeout = timeout; reqConfBuilder.setSocketTimeout(timeout);
reqConfBuilder.setConnectTimeout(timeout);
reqConfBuilder.setConnectionRequestTimeout(timeout);
} }
/** /**
@ -270,7 +277,7 @@ public class HTTPClient {
* @param userAgent * @param userAgent
*/ */
public void setUserAgent(final ClientIdentification.Agent agent) { public void setUserAgent(final ClientIdentification.Agent agent) {
this.agent = agent; clientBuilder.setUserAgent(agent.userAgent);
} }
/** /**
@ -288,7 +295,8 @@ public class HTTPClient {
* @param redirecting * @param redirecting
*/ */
public void setRedirecting(final boolean redirecting) { public void setRedirecting(final boolean redirecting) {
this.redirecting = redirecting; reqConfBuilder.setRedirectsEnabled(redirecting);
reqConfBuilder.setRelativeRedirectsAllowed(redirecting);
} }
/** /**
@ -354,7 +362,7 @@ public class HTTPClient {
} }
httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
return getContentBytes(httpGet, url.getHost(), maxBytes); return getContentBytes(httpGet, maxBytes);
} }
/** /**
@ -378,7 +386,7 @@ public class HTTPClient {
httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state httpGet.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
this.currentRequest = httpGet; this.currentRequest = httpGet;
execute(httpGet, url.getHost()); execute(httpGet);
} }
/** /**
@ -393,7 +401,7 @@ public class HTTPClient {
final HttpHead httpHead = new HttpHead(url.toNormalform(true)); final HttpHead httpHead = new HttpHead(url.toNormalform(true));
httpHead.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state httpHead.addHeader(new BasicHeader("Connection", "close")); // don't keep alive, prevent CLOSE_WAIT state
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
execute(httpHead, url.getHost()); execute(httpHead);
finish(); finish();
ConnectionInfo.removeConnection(httpHead.hashCode()); ConnectionInfo.removeConnection(httpHead.hashCode());
return this.httpResponse; return this.httpResponse;
@ -422,7 +430,7 @@ public class HTTPClient {
this.upbytes = length; this.upbytes = length;
httpPost.setEntity(inputStreamEntity); httpPost.setEntity(inputStreamEntity);
this.currentRequest = httpPost; this.currentRequest = httpPost;
execute(httpPost, host); execute(httpPost);
} }
/** /**
@ -454,10 +462,11 @@ public class HTTPClient {
setHost(vhost); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service setHost(vhost); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
if (vhost == null) setHost(Domains.LOCALHOST); if (vhost == null) setHost(Domains.LOCALHOST);
final MultipartEntity multipartEntity = new MultipartEntity(); final MultipartEntityBuilder entityBuilder = MultipartEntityBuilder.create();
for (final Entry<String,ContentBody> part : post.entrySet()) for (final Entry<String,ContentBody> part : post.entrySet())
multipartEntity.addPart(part.getKey(), part.getValue()); entityBuilder.addPart(part.getKey(), part.getValue());
final HttpEntity multipartEntity = entityBuilder.build();
// statistics // statistics
this.upbytes = multipartEntity.getContentLength(); this.upbytes = multipartEntity.getContentLength();
@ -467,7 +476,7 @@ public class HTTPClient {
httpPost.setEntity(multipartEntity); httpPost.setEntity(multipartEntity);
} }
return getContentBytes(httpPost, url.getHost(), Integer.MAX_VALUE); return getContentBytes(httpPost, Integer.MAX_VALUE);
} }
/** /**
@ -491,7 +500,7 @@ public class HTTPClient {
// statistics // statistics
this.upbytes = length; this.upbytes = length;
httpPost.setEntity(inputStreamEntity); httpPost.setEntity(inputStreamEntity);
return getContentBytes(httpPost, host, Integer.MAX_VALUE); return getContentBytes(httpPost, Integer.MAX_VALUE);
} }
/** /**
@ -580,9 +589,9 @@ public class HTTPClient {
} }
} }
private byte[] getContentBytes(final HttpUriRequest httpUriRequest, String host, final int maxBytes) throws IOException { private byte[] getContentBytes(final HttpUriRequest httpUriRequest, final int maxBytes) throws IOException {
try { try {
execute(httpUriRequest, host); execute(httpUriRequest);
if (this.httpResponse == null) return null; if (this.httpResponse == null) return null;
// get the response body // get the response body
final HttpEntity httpEntity = this.httpResponse.getEntity(); final HttpEntity httpEntity = this.httpResponse.getEntity();
@ -602,11 +611,13 @@ public class HTTPClient {
} }
} }
private void execute(final HttpUriRequest httpUriRequest, String host) throws IOException { private void execute(final HttpUriRequest httpUriRequest) throws IOException {
final HttpContext httpContext = new BasicHttpContext(); final HttpClientContext context = HttpClientContext.create();
context.setRequestConfig(reqConfBuilder.build());
if (this.host != null)
context.setTargetHost(new HttpHost(this.host));
setHeaders(httpUriRequest); setHeaders(httpUriRequest);
setParams(httpUriRequest.getParams());
setProxy(httpUriRequest.getParams(), host);
// statistics // statistics
storeConnectionInfo(httpUriRequest); storeConnectionInfo(httpUriRequest);
// execute the method; some asserts confirm that the request can be sent with Content-Length and is therefore not terminated by EOF // execute the method; some asserts confirm that the request can be sent with Content-Length and is therefore not terminated by EOF
@ -620,14 +631,17 @@ public class HTTPClient {
} }
Thread.currentThread().setName("HTTPClient-" + httpUriRequest.getURI().getHost()); Thread.currentThread().setName("HTTPClient-" + httpUriRequest.getURI().getHost());
final long time = System.currentTimeMillis();
try { try {
final long time = System.currentTimeMillis(); final CloseableHttpClient client = clientBuilder.build();
this.httpResponse = httpClient.execute(httpUriRequest, httpContext); this.httpResponse = client.execute(httpUriRequest, context);
this.httpResponse.setHeader(HeaderFramework.RESPONSE_TIME_MILLIS, Long.toString(System.currentTimeMillis() - time)); this.httpResponse.setHeader(HeaderFramework.RESPONSE_TIME_MILLIS, Long.toString(System.currentTimeMillis() - time));
} catch (final IOException e) { } catch (final IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode()); ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort(); httpUriRequest.abort();
throw new IOException("Client can't execute: " + (e.getCause() == null ? e.getMessage() : e.getCause().getMessage())); throw new IOException("Client can't execute: "
+ (e.getCause() == null ? e.getMessage() : e.getCause().getMessage())
+ " duration=" + Long.toString(System.currentTimeMillis() - time));
} }
} }
@ -669,23 +683,6 @@ public class HTTPClient {
httpUriRequest.setHeader("Authorization", "realm=" + this.realm); httpUriRequest.setHeader("Authorization", "realm=" + this.realm);
} }
private void setParams(final HttpParams httpParams) {
HttpClientParams.setRedirecting(httpParams, this.redirecting);
HttpConnectionParams.setConnectionTimeout(httpParams, this.timeout);
HttpConnectionParams.setSoTimeout(httpParams, this.timeout);
if (this.agent != null)
HttpProtocolParams.setUserAgent(httpParams, this.agent.userAgent);
if (this.host != null)
httpParams.setParameter(HTTP.TARGET_HOST, this.host);
}
private static void setProxy(final HttpParams httpParams, String host) {
if (ProxySettings.useForHost(host, Protocol.HTTP))
ConnRouteParams.setDefaultProxy(httpParams, ProxySettings.getProxyHost());
// TODO find a better way for this
ProxySettings.setProxyCreds((DefaultHttpClient) httpClient);
}
private void storeConnectionInfo(final HttpUriRequest httpUriRequest) { private void storeConnectionInfo(final HttpUriRequest httpUriRequest) {
final int port = httpUriRequest.getURI().getPort(); final int port = httpUriRequest.getURI().getPort();
final String thost = httpUriRequest.getURI().getHost(); final String thost = httpUriRequest.getURI().getHost();
@ -699,7 +696,7 @@ public class HTTPClient {
this.upbytes)); this.upbytes));
} }
private static SSLSocketFactory getSSLSocketFactory() { private static SSLConnectionSocketFactory getSSLSocketFactory() {
final TrustManager trustManager = new X509TrustManager() { final TrustManager trustManager = new X509TrustManager() {
@Override @Override
public void checkClientTrusted(final X509Certificate[] chain, final String authType) public void checkClientTrusted(final X509Certificate[] chain, final String authType)
@ -728,7 +725,9 @@ public class HTTPClient {
// e.printStackTrace(); // e.printStackTrace();
} }
final SSLSocketFactory sslSF = new SSLSocketFactory(sslContext, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); final SSLConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(
sslContext,
SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
return sslSF; return sslSF;
} }
@ -739,8 +738,8 @@ public class HTTPClient {
* *
* @param defaultHttpClient * @param defaultHttpClient
*/ */
private static void addCustomKeepAliveStrategy(final DefaultHttpClient defaultHttpClient) { private static ConnectionKeepAliveStrategy customKeepAliveStrategy() {
defaultHttpClient.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() { return new ConnectionKeepAliveStrategy() {
@Override @Override
public long getKeepAliveDuration(HttpResponse response, HttpContext context) { public long getKeepAliveDuration(HttpResponse response, HttpContext context) {
// Honor 'keep-alive' header // Honor 'keep-alive' header
@ -762,7 +761,7 @@ public class HTTPClient {
// Keep alive for 5 seconds only // Keep alive for 5 seconds only
return 5 * 1000; return 5 * 1000;
} }
}); };
} }
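The diff shows only the edges of the strategy body; the conventional implementation it wraps, honoring a server-sent "Keep-Alive: timeout=<seconds>" parameter and otherwise keeping connections for 5 seconds, would look roughly like this (a sketch, assuming httpclient 4.3):

import org.apache.http.HeaderElement;
import org.apache.http.HeaderElementIterator;
import org.apache.http.HttpResponse;
import org.apache.http.conn.ConnectionKeepAliveStrategy;
import org.apache.http.message.BasicHeaderElementIterator;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;

public class KeepAliveStrategyDemo {
    public static ConnectionKeepAliveStrategy customKeepAliveStrategy() {
        return new ConnectionKeepAliveStrategy() {
            @Override
            public long getKeepAliveDuration(HttpResponse response, HttpContext context) {
                // honor a "Keep-Alive: timeout=<seconds>" header sent by the server
                HeaderElementIterator it = new BasicHeaderElementIterator(
                        response.headerIterator(HTTP.CONN_KEEP_ALIVE));
                while (it.hasNext()) {
                    HeaderElement he = it.nextElement();
                    if (he.getValue() != null && "timeout".equalsIgnoreCase(he.getName())) {
                        try {
                            return Long.parseLong(he.getValue()) * 1000;
                        } catch (NumberFormatException ignore) {
                            // malformed value; fall through to the default below
                        }
                    }
                }
                return 5 * 1000; // otherwise keep alive for 5 seconds only
            }
        };
    }
}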
/** /**
@ -773,13 +772,13 @@ public class HTTPClient {
public static void main(final String[] args) { public static void main(final String[] args) {
String url = null; String url = null;
// prepare Parts // prepare Parts
final Map<String,ContentBody> newparts = new LinkedHashMap<String,ContentBody>(); // final Map<String,ContentBody> newparts = new LinkedHashMap<String,ContentBody>();
try { // try {
newparts.put("foo", new StringBody("FooBar")); // newparts.put("foo", new StringBody("FooBar"));
newparts.put("bar", new StringBody("BarFoo")); // newparts.put("bar", new StringBody("BarFoo"));
} catch (final UnsupportedEncodingException e) { // } catch (final UnsupportedEncodingException e) {
System.out.println(e.getStackTrace()); // System.out.println(e.getStackTrace());
} // }
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent); final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setRedirecting(false); client.setRedirecting(false);
// Get some // Get some
@ -805,7 +804,7 @@ public class HTTPClient {
// for (HeaderElement element: header.getElements()) // for (HeaderElement element: header.getElements())
// System.out.println("Element " + element.getName() + " : " + element.getValue()); // System.out.println("Element " + element.getName() + " : " + element.getValue());
} }
System.out.println(client.getHttpResponse().getLocale()); // System.out.println(client.getHttpResponse().getLocale());
System.out.println(client.getHttpResponse().getProtocolVersion()); System.out.println(client.getHttpResponse().getProtocolVersion());
System.out.println(client.getHttpResponse().getStatusLine()); System.out.println(client.getHttpResponse().getStatusLine());
// Post some // Post some
@ -822,49 +821,41 @@ public class HTTPClient {
} }
} }
    /**
     *
     * @see: http://hc.apache.org/httpcomponents-client-4.0.1/tutorial/html/connmgmt.html#d4e638
     *
     */
    private static class IdledConnectionEvictor extends Thread {

        private final ClientConnectionManager clientConnectionManager;

        private volatile boolean shutdown;

        public IdledConnectionEvictor(final ClientConnectionManager clientConnectionManager) {
            super();
            this.clientConnectionManager = clientConnectionManager;
        }

        @Override
        public void run() {
            try {
                while (!this.shutdown) {
                    synchronized (this) {
                        wait(5000);
                        // Close expired connections
                        this.clientConnectionManager.closeExpiredConnections();
                        // Optionally, close connections
                        // that have been idle longer than 5 sec
                        // (some SOHO routers act strangely on >5 sec idled connections)
                        this.clientConnectionManager.closeIdleConnections(5, TimeUnit.SECONDS);
                    }
                }
            } catch (final InterruptedException ex) {
                // terminate
            }
        }

        public void shutdown() {
            this.shutdown = true;
            synchronized (this) {
                notifyAll();
            }
        }
    }

    public static class IdleConnectionMonitorThread extends Thread {

        private final HttpClientConnectionManager connMgr;
        private volatile boolean shutdown;

        public IdleConnectionMonitorThread(HttpClientConnectionManager connMgr) {
            super();
            this.connMgr = connMgr;
        }

        @Override
        public void run() {
            try {
                while (!shutdown) {
                    synchronized (this) {
                        wait(5000);
                        // Close expired connections
                        connMgr.closeExpiredConnections();
                        // Optionally, close connections
                        // that have been idle longer than 30 sec
                        connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
                    }
                }
                connMgr.shutdown();
            } catch (final InterruptedException ex) {
                // terminate
            }
        }

        public void shutdown() {
            shutdown = true;
            synchronized (this) {
                notifyAll();
            }
        }
    }
} }
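Unlike the removed IdledConnectionEvictor, the new monitor also shuts the connection manager down once its loop exits, and closes idle connections after 30 seconds instead of 5. A minimal usage sketch, assuming a PoolingHttpClientConnectionManager; the actual manager instance in YaCy lives inside HTTPClient:

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import net.yacy.cora.protocol.http.HTTPClient;

final PoolingHttpClientConnectionManager connMgr = new PoolingHttpClientConnectionManager();
final HTTPClient.IdleConnectionMonitorThread monitor = new HTTPClient.IdleConnectionMonitorThread(connMgr);
monitor.start();
// ... run the client ...
monitor.shutdown(); // wakes the thread; the monitor then calls connMgr.shutdown() itself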

@ -27,10 +27,16 @@ package net.yacy.cora.protocol.http;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import org.apache.http.HttpException;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.auth.AuthScope; import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.impl.client.AbstractHttpClient; import org.apache.http.client.CredentialsProvider;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.protocol.HttpContext;
/** /**
* settings for a remote proxy * settings for a remote proxy
@ -71,12 +77,36 @@ public final class ProxySettings {
return new HttpHost(host, port); return new HttpHost(host, port);
} }
    public static void setProxyCreds(AbstractHttpClient httpClient) {
        if (!use) return;
        httpClient.getCredentialsProvider().setCredentials(
                new AuthScope(host, port),
                new UsernamePasswordCredentials(user, password));
    }

    public static HttpRoutePlanner RoutePlanner = new HttpRoutePlanner() {

        @Override
        public HttpRoute determineRoute(HttpHost target, HttpRequest request, HttpContext context) throws HttpException {
            if (use) {
                final Protocol protocol = "https".equalsIgnoreCase(target.getSchemeName()) ? Protocol.HTTPS : Protocol.HTTP;
                if (useForHost(target.getHostName(), protocol))
                    return new HttpRoute(target, null, getProxyHost(), protocol == Protocol.HTTPS);
            }
            return new HttpRoute(target); // direct
        }
    };
public static CredentialsProvider CredsProvider = new CredentialsProvider() {
@Override
public void clear() {
}
@Override
public Credentials getCredentials(AuthScope scope) {
if (host != null && host.equals(scope.getHost()) && port == scope.getPort())
return new UsernamePasswordCredentials(user, password);
return null;
}
@Override
public void setCredentials(AuthScope arg0, Credentials arg1) {
}
};
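With AbstractHttpClient gone in HttpClient 4.3, the proxy route and credentials become inputs to the client builder instead of mutations on an already-built client. One plausible wiring, a sketch only, since the actual builder call is not part of this hunk:

import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

final CloseableHttpClient client = HttpClients.custom()
        .setRoutePlanner(ProxySettings.RoutePlanner)
        .setDefaultCredentialsProvider(ProxySettings.CredsProvider)
        .build();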
/** /**
* tell if a remote proxy will be used for the given host * tell if a remote proxy will be used for the given host

@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.storage.HandleSet; import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency; import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
@ -293,37 +290,6 @@ public class Balancer {
return map; return map;
} }
/**
     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* load a robots.txt to get the robots time.
     * ATTENTION: this method causes a robots.txt to be loaded from the web, which may cause a longer delay in execution.
* This shall therefore not be called in synchronized environments.
* @param robots
* @param profileEntry
* @param crawlURL
* @return
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
/** /**
* get lists of crawl request entries for a specific host * get lists of crawl request entries for a specific host
* @param host * @param host
@ -428,13 +394,13 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again // if not: return null. A calling method must handle the null value and try again
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle())); profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) { if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue; continue;
} }
// depending on the caching policy we need sleep time to avoid DoS-like situations // depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url()); sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes()); assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash()); assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@ -445,7 +411,7 @@ public class Balancer {
} }
if (crawlEntry == null) return null; if (crawlEntry == null) return null;
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent(); ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent); long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime); Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) { if (delay && sleeptime > 0) {
// force a busy waiting here // force a busy waiting here
@ -515,7 +481,7 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false); rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted meanwhile if (rowEntry == null) continue; // may have been deleted meanwhile
Request crawlEntry = new Request(rowEntry); Request crawlEntry = new Request(rowEntry);
CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle())); CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) { if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue; continue;

@ -1,312 +0,0 @@
/**
* CrawlQueue
* Copyright 2013 by Michael Christen
* First released 30.08.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
public class CrawlQueue {
private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000;
private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
private BufferedObjectIndex urlFileIndex;
private final HandleSet double_push_check;
public CrawlQueue(
final File cachePath,
final String filename,
final boolean useTailCache,
final boolean exceed134217727) {
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
cachePath.mkdirs();
final File f = new File(cachePath, filename);
try {
this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e) {
try {
this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e1) {
ConcurrentLog.logException(e1);
}
}
this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
ConcurrentLog.info("CrawlQueue", "opened queue file with " + this.urlFileIndex.size() + " entries from " + f.toString());
}
public synchronized void close() {
if (this.urlFileIndex != null) {
this.urlFileIndex.close();
this.urlFileIndex = null;
}
}
public void clear() {
ConcurrentLog.info("CrawlQueue", "cleaning CrawlQueue with " + this.urlFileIndex.size() + " entries from " + this.urlFileIndex.filename());
try {
this.urlFileIndex.clear();
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
this.double_push_check.clear();
}
public Request get(final byte[] urlhash) throws IOException {
assert urlhash != null;
if (this.urlFileIndex == null) return null; // case occurs during shutdown
final Row.Entry entry = this.urlFileIndex.get(urlhash, false);
if (entry == null) return null;
return new Request(entry);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
// removes all entries with a specific profile hash.
// this may last some time
// returns number of deletions
// first find a list of url hashes that shall be deleted
final HandleSet urlHashes = new RowHandleSet(this.urlFileIndex.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
synchronized (this) {
final Iterator<Row.Entry> i = this.urlFileIndex.rows();
Row.Entry rowEntry;
Request crawlEntry;
while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
rowEntry = i.next();
crawlEntry = new Request(rowEntry);
if (crawlEntry.profileHandle().equals(profileHandle)) {
urlHashes.put(crawlEntry.url().hash());
}
}
}
// then delete all these urls from the queues and the file index
return remove(urlHashes);
}
/**
     * this method is only here, because so many import/export methods need it
     * and it was implemented in the previous architecture;
     * however, usage is not recommended
     * @param urlHashes a list of hashes that shall be removed
* @return number of entries that had been removed
* @throws IOException
*/
public synchronized int remove(final HandleSet urlHashes) throws IOException {
final int s = this.urlFileIndex.size();
int removedCounter = 0;
for (final byte[] urlhash: urlHashes) {
final Row.Entry entry = this.urlFileIndex.remove(urlhash);
if (entry != null) removedCounter++;
// remove from double-check caches
this.double_push_check.remove(urlhash);
}
if (removedCounter == 0) return 0;
assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
return removedCounter;
}
public boolean has(final byte[] urlhashb) {
return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb);
}
public int size() {
return this.urlFileIndex.size();
}
public boolean isEmpty() {
return this.urlFileIndex.isEmpty();
}
/**
* push a crawl request on the balancer stack
* @param entry
* @return null if this was successful or a String explaining what went wrong in case of an error
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
// double-check
if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
this.double_push_check.put(hash);
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
// add to index
final int s = this.urlFileIndex.size();
this.urlFileIndex.put(entry.toRow());
assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
// add the hash to a queue if the host is unknown to get this fast into the balancer
            // now disabled to prevent a crawl from 'freezing' to a specific domain that hosts a lot of pages; the queues are filled anyway
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
}
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
return null;
}
/**
     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* load a robots.txt to get the robots time.
     * ATTENTION: this method causes a robots.txt to be loaded from the web, which may cause a longer delay in execution.
* This shall therefore not be called in synchronized environments.
* @param robots
* @param profileEntry
* @param crawlURL
* @return
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
/**
* get the next entry in this crawl queue in such a way that the domain access time delta is maximized
* and always above the given minimum delay time. An additional delay time is computed using the robots.txt
     * crawl-delay time which is always respected. In case the minimum time cannot be ensured, this method pauses
     * the necessary time until the url is released and returned as CrawlEntry object. In case a profile
     * for the computed Entry does not exist, null is returned.
* @param delay true if the requester demands forced delays using explicit thread sleep
* @param profile
* @return a url in a CrawlEntry object
* @throws IOException
* @throws SpaceExceededException
*/
public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
if (this.urlFileIndex.isEmpty()) return null;
long sleeptime = 0;
Request crawlEntry = null;
CrawlProfile profileEntry = null;
while (this.urlFileIndex.size() > 0) {
synchronized (this) {
Row.Entry rowEntry = this.urlFileIndex.removeOne();
if (rowEntry == null) return null;
crawlEntry = new Request(rowEntry);
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("CrawlQueue", "no profile entry for handle " + crawlEntry.profileHandle());
return null;
}
// check blacklist (again) because the user may have created blacklist entries after the queue has been filled
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
ConcurrentLog.fine("CrawlQueue", "URL '" + crawlEntry.url() + "' is in blacklist.");
continue;
}
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("CrawlQueue", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
}
}
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
                // in the best case, this should never happen if the balancer works properly
                // this is only a protection against the worst case, where the crawler could
                // behave in a DoS manner
ConcurrentLog.info("CrawlQueue", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {
rest = rest + 1000 * loops;
loops = 0;
}
Thread.currentThread().setName("CrawlQueue waiting for " +crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
synchronized(this) {
                // must be synchronized here to avoid 'takeover' moves from other threads, which would then idle for the same time, which would not be enough
if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
ConcurrentLog.info("CrawlQueue", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
}
return crawlEntry;
}
}

@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.retrieval.FTPLoader; import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader; import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
@ -149,7 +147,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db // if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) { if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle())); final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
} }
} catch (final Exception e) { } catch (final Exception e) {
@ -294,7 +292,8 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) { public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle())); byte[] handle = UTF8.getBytes(entry.profileHandle());
final CrawlProfile profile = this.crawler.get(handle);
String error; String error;
if (profile == null) { if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
@ -302,7 +301,9 @@ public final class CrawlStacker {
return error; return error;
} }
error = checkAcceptance(entry.url(), profile, entry.depth()); error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
if (error != null) return error;
error = checkAcceptanceInitially(entry.url(), profile);
if (error != null) return error; if (error != null) return error;
// store information // store information
@ -366,53 +367,16 @@ public final class CrawlStacker {
return null; return null;
} }
public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) { /**
     * Test if a url shall be accepted for crawl using attributes that are consistent for the whole crawl.
     * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
     * @param url
     * @param profile
     * @return null if the url is accepted, otherwise an error string describing why the url was rejected
*/
public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();
final String urlstring = url.toString(); final String urlstring = url.toString();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
this.log.fine("URL '" + urlstring + "' is in blacklist.");
return "url in blacklist";
}
// filter with must-match for URLs
if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
}
// filter with must-not-match for URLs
if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
}
// deny cgi
if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
return "individual url (sessionid etc) not wanted";
}
// deny post properties
if (url.isPOST() && !profile.crawlingQ()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
return "post url not allowed";
}
// check if the url is double registered // check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
@ -451,13 +415,72 @@ public final class CrawlStacker {
final AtomicInteger dp = profile.getCount(url.getHost()); final AtomicInteger dp = profile.getCount(url.getHost());
if (dp != null && dp.get() >= maxAllowedPagesPerDomain) { if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "crawl stack domain counter exceeded"; return "crawl stack domain counter exceeded (test by profile)";
} }
/*
if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) { if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "result stack domain counter exceeded"; return "result stack domain counter exceeded (test by domainCount)";
} }
*/
}
return null;
}
/**
     * Test if a url shall be accepted using attributes that are defined by a crawl start but can be changed during a crawl.
* @param url
* @param profile
* @param depth
     * @return null if the url is accepted, otherwise an error string describing why the url was rejected
*/
public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();
final String urlstring = url.toString();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
this.log.fine("URL '" + urlstring + "' is in blacklist.");
return "url in blacklist";
}
// filter with must-match for URLs
if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
}
// filter with must-not-match for URLs
if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
}
// deny cgi
if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
return "individual url (sessionid etc) not wanted";
}
// deny post properties
if (url.isPOST() && !profile.crawlingQ()) {
if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
return "post url not allowed";
} }
// the following filters use a DNS lookup to check if the url matches with IP filter // the following filters use a DNS lookup to check if the url matches with IP filter
@ -498,7 +521,6 @@ public final class CrawlStacker {
return null; return null;
} }
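Condensed, the gate in stackCrawl above now runs in two stages; the sketch below mirrors the call order shown in this diff (url, profile and depth are the surrounding YaCy objects):

// stage 1: attributes that may change while the crawl runs
String error = checkAcceptanceChangeable(url, profile, depth);     // protocol, blacklist, must-(not-)match filters, cgi/post
// stage 2: attributes fixed at crawl start
if (error == null) error = checkAcceptanceInitially(url, profile); // double registration, recrawl age, domain counters
if (error != null) {
    // rejected: error carries a human-readable reason for the error-URL db
}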
/** /**
 * Test whether a url can be used for crawling/indexing * Test whether a url can be used for crawling/indexing
* This mainly checks if the url is in the declared domain (local/global) * This mainly checks if the url is in the declared domain (local/global)

@ -80,8 +80,8 @@ public final class CrawlSwitchboard {
DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE); DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE);
} }
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap"; public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap"; public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@ -103,21 +103,23 @@ public final class CrawlSwitchboard {
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile; public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot; private final File queuesRoot;
private Switchboard switchboard;
public CrawlSwitchboard(final String networkName, final ConcurrentLog log, final File queuesRoot) { public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
log.info("Initializing Word Index for the network '" + networkName + "'."); this.switchboard = switchboard;
this.log = this.switchboard.log;
this.queuesRoot = this.switchboard.queuesRoot;
this.log.info("Initializing Word Index for the network '" + networkName + "'.");
if ( networkName == null || networkName.isEmpty() ) { if ( networkName == null || networkName.isEmpty() ) {
log.severe("no network name given - shutting down"); log.severe("no network name given - shutting down");
System.exit(0); System.exit(0);
} }
this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder)); this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
this.profilesActiveCrawlsCounter = new ConcurrentHashMap<String, RowHandleSet>(); this.profilesActiveCrawlsCounter = new ConcurrentHashMap<String, RowHandleSet>();
// make crawl profiles database and default profiles // make crawl profiles database and default profiles
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs(); this.queuesRoot.mkdirs();
this.log.config("Initializing Crawl Profiles"); this.log.config("Initializing Crawl Profiles");
@ -166,6 +168,23 @@ public final class CrawlSwitchboard {
/ 1024); / 1024);
} }
/**
     * Get a profile from the active or passive stack. Should be used to be sure not to miss old, cleaned profiles.
     * A profile that was discovered in the passive stack is automatically shifted back to the active stack.
* @param profileKey
* @return
*/
public CrawlProfile get(final byte[] profileKey) {
CrawlProfile profile = getActive(profileKey);
if (profile != null) return profile;
profile = getPassive(profileKey);
if (profile == null) return null;
        // revive the profile: shift it back to the active stack
this.putActive(profileKey, profile);
this.removePassive(profileKey);
return profile;
}
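Callers that previously used getActive() are switched to get() throughout this commit; the effect is that a request whose profile was already cleaned away to the passive stack still resolves. A sketch of the typical call site, where crawler and request are the surrounding YaCy objects:

CrawlProfile profile = crawler.get(UTF8.getBytes(request.profileHandle()));
if (profile == null) {
    // gone from both stacks: the request must be dropped or error-logged
}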
public CrawlProfile getActive(final byte[] profileKey) { public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) { if ( profileKey == null ) {
return null; return null;
@ -237,10 +256,12 @@ public final class CrawlSwitchboard {
public void putActive(final byte[] profileKey, final CrawlProfile profile) { public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile); this.profilesActiveCrawls.put(profileKey, profile);
this.profilesActiveCrawlsCache.put(profileKey, profile); this.profilesActiveCrawlsCache.put(profileKey, profile);
this.removePassive(profileKey);
} }
public void putPassive(final byte[] profileKey, final CrawlProfile profile) { public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesPassiveCrawls.put(profileKey, profile); this.profilesPassiveCrawls.put(profileKey, profile);
this.removeActive(profileKey);
} }
public RowHandleSet getURLHashes(final byte[] profileKey) { public RowHandleSet getURLHashes(final byte[] profileKey) {
@ -534,7 +555,7 @@ public final class CrawlSwitchboard {
return hasDoneSomething; return hasDoneSomething;
} }
public int cleanFinishesProfiles(CrawlQueues crawlQueues) { public Set<String> getFinishesProfiles(CrawlQueues crawlQueues) {
// clear the counter cache // clear the counter cache
this.profilesActiveCrawlsCounter.clear(); this.profilesActiveCrawlsCounter.clear();
@ -547,7 +568,7 @@ public final class CrawlSwitchboard {
deletionCandidate.add(ASCII.String(handle)); deletionCandidate.add(ASCII.String(handle));
} }
} }
if (deletionCandidate.size() == 0) return 0; if (deletionCandidate.size() == 0) return new HashSet<String>(0);
// iterate through all the queues and see if one of these handles appear there // iterate through all the queues and see if one of these handles appear there
// this is a time-consuming process, set a time-out // this is a time-consuming process, set a time-out
@ -564,15 +585,24 @@ public final class CrawlSwitchboard {
if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);} if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
deletionCandidate.remove(handle); deletionCandidate.remove(handle);
if (deletionCandidate.size() == 0) return 0; if (deletionCandidate.size() == 0) return new HashSet<String>(0);
if (System.currentTimeMillis() > timeout) return 0; // give up; this is too large if (System.currentTimeMillis() > timeout) return new HashSet<String>(0); // give up; this is too large
} }
if (deletionCandidate.size() == 0) return 0; if (deletionCandidate.size() == 0) return new HashSet<String>(0);
}
// look into the CrawlQueues.worker as well
Request[] requests = switchboard.crawlQueues.activeWorkerEntries();
for (Request request: requests) {
deletionCandidate.remove(request.profileHandle());
} }
} catch (final Throwable e) { } catch (final Throwable e) {
return 0; ConcurrentLog.logException(e);
return new HashSet<String>(0);
} }
return deletionCandidate;
}
public void cleanProfiles(Set<String> deletionCandidate) {
// all entries that are left are candidates for deletion; do that now // all entries that are left are candidates for deletion; do that now
for (String h: deletionCandidate) { for (String h: deletionCandidate) {
byte[] handle = ASCII.getBytes(h); byte[] handle = ASCII.getBytes(h);
@ -582,7 +612,6 @@ public final class CrawlSwitchboard {
this.removeActive(handle); this.removeActive(handle);
} }
} }
return deletionCandidate.size();
} }
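The old cleanFinishesProfiles is split so that a caller can act on the finished handles before they are deleted. A sketch of the presumed two-step call pattern (hypothetical call site; the actual caller lives elsewhere in the Switchboard):

Set<String> finished = crawler.getFinishesProfiles(sb.crawlQueues);
if (!finished.isEmpty()) {
    // postprocessing can run on the finished handles here, before the profiles are removed
    crawler.cleanProfiles(finished);
}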
public synchronized void close() { public synchronized void close() {

@ -0,0 +1,256 @@
/**
* HostQueue
* Copyright 2013 by Michael Christen
* First released 24.09.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
public class HostQueue {
public static final String indexSuffix = ".stack";
private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000;
private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
private final String hostHash;
private final File queuesPath;
private BufferedObjectIndex requestStack;
private HandleSet urlHashDoubleCheck;
public HostQueue(
final File queuesPath,
final String hostHash,
final boolean useTailCache,
final boolean exceed134217727) {
this.hostHash = hostHash;
this.queuesPath = queuesPath;
this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
// create a stack for newly entered entries
if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path
this.queuesPath.mkdirs();
final File f = new File(this.queuesPath, this.hostHash + indexSuffix);
try {
this.requestStack = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e) {
try {
this.requestStack = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
} catch (final SpaceExceededException e1) {
ConcurrentLog.logException(e1);
}
}
ConcurrentLog.info("Balancer", "opened balancer file with " + this.requestStack.size() + " entries from " + f.toString());
}
public synchronized void close() {
int sizeBeforeClose = this.size();
if (this.urlHashDoubleCheck != null) {
this.urlHashDoubleCheck.clear();
this.urlHashDoubleCheck = null;
}
if (this.requestStack != null) {
this.requestStack.close();
this.requestStack = null;
}
if (sizeBeforeClose == 0) {
// clean up
new File(this.queuesPath, this.hostHash + indexSuffix).delete();
}
}
public void clear() {
try {
this.requestStack.clear();
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
this.urlHashDoubleCheck.clear();
}
public Request get(final byte[] urlhash) throws IOException {
assert urlhash != null;
if (this.requestStack == null) return null; // case occurs during shutdown
final Row.Entry entry = this.requestStack.get(urlhash, false);
if (entry == null) return null;
return new Request(entry);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
// first find a list of url hashes that shall be deleted
final HandleSet urlHashes = new RowHandleSet(this.requestStack.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
synchronized (this) {
final Iterator<Row.Entry> i = this.requestStack.rows();
Row.Entry rowEntry;
Request crawlEntry;
while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
rowEntry = i.next();
crawlEntry = new Request(rowEntry);
if (crawlEntry.profileHandle().equals(profileHandle)) {
urlHashes.put(crawlEntry.url().hash());
}
}
}
// then delete all these urls from the queues and the file index
return remove(urlHashes);
}
/**
* remove urls from the queue
     * @param urlHashes a list of hashes that shall be removed
* @return number of entries that had been removed
* @throws IOException
*/
public synchronized int remove(final HandleSet urlHashes) throws IOException {
final int s = this.requestStack.size();
int removedCounter = 0;
for (final byte[] urlhash: urlHashes) {
final Row.Entry entry = this.requestStack.remove(urlhash);
if (entry != null) removedCounter++;
// remove from double-check caches
this.urlHashDoubleCheck.remove(urlhash);
}
if (removedCounter == 0) return 0;
assert this.requestStack.size() + removedCounter == s : "urlFileIndex.size() = " + this.requestStack.size() + ", s = " + s;
return removedCounter;
}
public boolean has(final byte[] urlhashb) {
return this.requestStack.has(urlhashb) || this.urlHashDoubleCheck.has(urlhashb);
}
public int size() {
return this.requestStack.size();
}
public boolean isEmpty() {
return this.requestStack.isEmpty();
}
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
// double-check
if (this.urlHashDoubleCheck.has(hash)) return "double occurrence in double_push_check";
if (this.requestStack.has(hash)) return "double occurrence in urlFileIndex";
if (this.urlHashDoubleCheck.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.urlHashDoubleCheck.clear();
this.urlHashDoubleCheck.put(hash);
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
// add to index
final int s = this.requestStack.size();
this.requestStack.put(entry.toRow());
assert s < this.requestStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.requestStack.size();
assert this.requestStack.has(hash) : "hash = " + ASCII.String(hash);
// add the hash to a queue if the host is unknown to get this fast into the balancer
            // now disabled to prevent a crawl from 'freezing' to a specific domain that hosts a lot of pages; the queues are filled anyway
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
}
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
return null;
}
public Request pop() throws IOException {
        // returns a crawl entry from the stack; the blacklist is re-checked here
Request crawlEntry = null;
while (!this.requestStack.isEmpty()) {
synchronized (this) {
Row.Entry rowEntry = this.requestStack.removeOne();
if (rowEntry == null) return null;
crawlEntry = new Request(rowEntry);
// check blacklist (again) because the user may have created blacklist entries after the queue has been filled
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
ConcurrentLog.fine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
continue;
}
break;
}
}
if (crawlEntry == null) return null;
return crawlEntry;
}
public Iterator<Request> iterator() throws IOException {
return new EntryIterator();
}
private class EntryIterator implements Iterator<Request> {
private Iterator<Row.Entry> rowIterator;
public EntryIterator() throws IOException {
this.rowIterator = HostQueue.this.requestStack.rows();
}
@Override
public boolean hasNext() {
return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
}
@Override
public Request next() {
final Row.Entry entry = this.rowIterator.next();
try {
return (entry == null) ? null : new Request(entry);
} catch (final IOException e) {
ConcurrentLog.logException(e);
this.rowIterator = null;
return null;
}
}
@Override
public void remove() {
if (this.rowIterator != null) this.rowIterator.remove();
}
}
}

@ -0,0 +1,169 @@
/**
* HostQueues
* Copyright 2013 by Michael Christen
* First released 24.09.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
/**
 * wrapper around single-host HostQueue stacks; this is a collection of such queues.
 * All these queues are stored in a common directory for the queue stacks
*/
public class HostQueues {
private final File queuesPath;
private final boolean useTailCache;
private final boolean exceed134217727;
private final Map<String, HostQueue> queues;
public HostQueues(
final File queuesPath,
final boolean useTailCache,
final boolean exceed134217727) {
this.queuesPath = queuesPath;
this.useTailCache = useTailCache;
this.exceed134217727 = exceed134217727;
// create a stack for newly entered entries
if (!(queuesPath.exists())) queuesPath.mkdir(); // make the path
this.queuesPath.mkdirs();
this.queues = new HashMap<String, HostQueue>();
String[] list = this.queuesPath.list();
for (String queuefile: list) {
if (queuefile.endsWith(HostQueue.indexSuffix)) {
String hosthash = queuefile.substring(0, queuefile.length() - HostQueue.indexSuffix.length());
HostQueue queue = new HostQueue(this.queuesPath, hosthash, this.useTailCache, this.exceed134217727);
this.queues.put(hosthash, queue);
}
}
}
public synchronized void close() {
for (HostQueue queue: this.queues.values()) queue.close();
this.queues.clear();
}
public void clear() {
for (HostQueue queue: this.queues.values()) queue.clear();
this.queues.clear();
}
public Request get(final byte[] urlhash) throws IOException {
String hosthash = ASCII.String(urlhash, 6, 6);
HostQueue queue = this.queues.get(hosthash);
if (queue == null) return null;
return queue.get(urlhash);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
int c = 0;
for (HostQueue queue: this.queues.values()) c += queue.removeAllByProfileHandle(profileHandle, timeout);
return c;
}
public synchronized int remove(final HandleSet urlHashes) throws IOException {
Map<String, HandleSet> removeLists = new HashMap<String, HandleSet>();
for (byte[] urlhash: urlHashes) {
String hosthash = ASCII.String(urlhash, 6, 6);
HandleSet removeList = removeLists.get(hosthash);
if (removeList == null) {
removeList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 100);
removeLists.put(hosthash, removeList);
}
try {removeList.put(urlhash);} catch (SpaceExceededException e) {}
}
int c = 0;
for (Map.Entry<String, HandleSet> entry: removeLists.entrySet()) {
HostQueue queue = this.queues.get(entry.getKey());
if (queue != null) c += queue.remove(entry.getValue());
}
return c;
}
public boolean has(final byte[] urlhashb) {
String hosthash = ASCII.String(urlhashb, 6, 6);
HostQueue queue = this.queues.get(hosthash);
if (queue == null) return false;
return queue.has(urlhashb);
}
public int size() {
int c = 0;
for (HostQueue queue: this.queues.values()) c += queue.size();
return c;
}
public boolean isEmpty() {
for (HostQueue queue: this.queues.values()) if (!queue.isEmpty()) return false;
return true;
}
/**
* push a request to one of the host queues. If the queue does not exist, it is created
* @param entry
* @param profile
* @param robots
* @return null if everything is ok or a string with an error message if the push is not allowed according to the crawl profile or robots
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
String hosthash = ASCII.String(entry.url().hash(), 6, 6);
HostQueue queue = this.queues.get(hosthash);
if (queue == null) {
queue = new HostQueue(this.queuesPath, hosthash, this.useTailCache, this.exceed134217727);
this.queues.put(hosthash, queue);
}
return queue.push(entry, profile, robots);
}
/**
     * remove one request from each stack except those listed in notFromHost
* @param notFromHost do not collect from these hosts
* @return a list of requests
* @throws IOException
*/
public List<Request> pop(Set<String> notFromHost) throws IOException {
ArrayList<Request> requests = new ArrayList<Request>();
for (Map.Entry<String, HostQueue> entry: this.queues.entrySet()) {
if (notFromHost.contains(entry.getKey())) continue;
Request r = entry.getValue().pop();
if (r != null) requests.add(r);
}
return requests;
}
}
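push() and pop() route by ASCII.String(entry.url().hash(), 6, 6), the 6-character host part of YaCy's 12-character URL hash, so all requests for one host land in the same HostQueue. A standalone, JDK-only mimic of that sharding, with made-up hash values (YaCy's real hashes come from DigestURL, not from these literals):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class HostShardSketch {

    // mirrors ASCII.String(urlhash, 6, 6): characters 6..11 encode the host
    static String hosthash(final String urlhash12) {
        return urlhash12.substring(6, 12);
    }

    public static void main(final String[] args) {
        final Map<String, List<String>> queues = new HashMap<String, List<String>>();
        // made-up 12-character hashes; only the last 6 characters matter here
        for (final String h : new String[] {"AAAAAAhostAA", "BBBBBBhostAA", "CCCCCChostBB"}) {
            List<String> queue = queues.get(hosthash(h));
            if (queue == null) {
                queue = new ArrayList<String>();
                queues.put(hosthash(h), queue);
            }
            queue.add(h);
        }
        System.out.println(queues.keySet()); // two shards, hostAA and hostBB (order may vary)
    }
}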

@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
} }
if (name.length() > 256) name = name.substring(256); if (name.length() > 256) name = name.substring(256);
this.doms = new ConcurrentHashMap<String, AtomicInteger>(); this.doms = new ConcurrentHashMap<String, AtomicInteger>();
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength); final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
put(HANDLE, handle); put(HANDLE, handle);
put(NAME, name); put(NAME, name);
put(AGENT_NAME, userAgentName); put(AGENT_NAME, userAgentName);

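The CrawlProfile handle above is now a digest over the profile name plus the defining crawl parameters, so two crawls that differ only in depth or filters no longer collide on the same handle. A JDK-only mimic of the idea; note the swaps: java.security and hex output stand in for YaCy's Digest and Base64Order classes, and the 12-character truncation assumes Word.commonHashLength == 12:

import java.security.MessageDigest;

public final class HandleSketch {

    static String handle(final String name, final String mustMatch, final int depth,
                         final String mustNotMatch, final int domMaxPages, final String collections)
            throws Exception {
        // same concatenation as in the diff: name + must-match + depth + must-not-match + page limit + collections
        final String seed = name + mustMatch + depth + mustNotMatch + domMaxPages + collections;
        final byte[] md5 = MessageDigest.getInstance("MD5").digest(seed.getBytes("UTF-8"));
        final StringBuilder s = new StringBuilder();
        for (final byte b : md5) s.append(String.format("%02x", b & 0xff));
        return s.substring(0, 12); // YaCy truncates to Word.commonHashLength
    }

    public static void main(final String[] args) throws Exception {
        // same name, different depth: distinct handles after this change
        System.out.println(handle("mycrawl", ".*", 3, "", 1000, "user"));
        System.out.println(handle("mycrawl", ".*", 4, "", 1000, "user"));
    }
}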
@ -255,7 +255,7 @@ public class CrawlQueues {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true; return true;
} }
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle)); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
if (profile == null) { if (profile == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true; return true;
@ -269,14 +269,13 @@ public class CrawlQueues {
if (urlEntry == null) { if (urlEntry == null) {
continue; continue;
} }
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling: // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
if (profileHandle == null) { if (urlEntry.profileHandle() == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true; return true;
} }
load(urlEntry, stats, profileHandle); load(urlEntry, stats);
return true; return true;
} catch (final IOException e) { } catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@ -296,8 +295,8 @@ public class CrawlQueues {
* @param stats String for log prefixing * @param stats String for log prefixing
* @return * @return
*/ */
private void load(final Request urlEntry, final String stats, final String profileHandle) { private void load(final Request urlEntry, final String stats) {
final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle)); final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(urlEntry.profileHandle()));
if (profile != null) { if (profile != null) {
// check if the protocol is supported // check if the protocol is supported
@@ -574,11 +573,7 @@ public class CrawlQueues {
         try {
             final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots);
             if (urlEntry == null) return false;
-            final String profileHandle = urlEntry.profileHandle();
-            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
-            // profileHandle = " + profileHandle + ", urlEntry.url = " +
-            // urlEntry.url());
-            load(urlEntry, stats, profileHandle);
+            load(urlEntry, stats);
             return true;
         } catch (final IOException e) {
             this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@@ -606,7 +601,7 @@ public class CrawlQueues {
            this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
            this.code = Integer.valueOf(entry.hashCode());
            this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
-            this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
+            this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
        }

        private long age() {

@@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicLong;

 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@ -262,6 +263,37 @@ public class Latency {
return s.toString(); return s.toString();
} }
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* load a robots.txt to get the robots time.
* ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
* This shall therefore not be called in synchronized environments.
* @param robots
* @param profileEntry
* @param crawlURL
* @return
*/
public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
public static final class Host { public static final class Host {
private AtomicLong timeacc; private AtomicLong timeacc;
private AtomicLong lastacc; private AtomicLong lastacc;
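
A hypothetical call site for the two helpers added above (the package paths are assumed from this tree; robots, profile and url stand for objects the crawler already holds):

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.crawler.data.CrawlProfile;
    import net.yacy.crawler.data.Latency;
    import net.yacy.crawler.robots.RobotsTxt;

    final class LatencySketch {
        // sleep only for positive results: getDomainSleepTime may return Integer.MIN_VALUE
        // (no limit at all) or a negative value, getRobotsTime is never negative by contract
        static void politeDelay(RobotsTxt robots, CrawlProfile profile, DigestURL url) throws InterruptedException {
            long wait = Math.max(
                    Latency.getDomainSleepTime(robots, profile, url),
                    Latency.getRobotsTime(robots, url, profile.getAgent()));
            if (wait > 0) Thread.sleep(Math.min(wait, 10000)); // defensive cap
        }
    }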

@ -101,7 +101,7 @@ public class FTPLoader {
// create new ftp client // create new ftp client
final FTPClient ftpClient = new FTPClient(); final FTPClient ftpClient = new FTPClient();
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
// get a connection // get a connection
if (openConnection(ftpClient, entryUrl)) { if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory // test if the specified file is a directory
@ -249,7 +249,7 @@ public class FTPLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response( final Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -264,7 +264,7 @@ public class FTPLoader {
final byte[] b = ftpClient.get(path); final byte[] b = ftpClient.get(path);
// create a response // create a response
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response( final Response response = new Response(
request, request,
requestHeader, requestHeader,

@ -83,7 +83,7 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(200); ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -123,7 +123,7 @@ public class FileLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -140,7 +140,7 @@ public class FileLoader {
is.close(); is.close();
// create response with loaded content // create response with loaded content
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,

@ -589,7 +589,7 @@ public class Response {
// -if-modified-since in request // -if-modified-since in request
// if the page is fresh at the very moment we can index it // if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.requestHeader.ifModifiedSince(); final Date ifModifiedSince = this.ifModifiedSince();
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) { if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date // parse date
Date d = this.responseHeader.lastModified(); Date d = this.responseHeader.lastModified();

@ -101,7 +101,7 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(200); ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -141,7 +141,7 @@ public class SMBLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -158,7 +158,7 @@ public class SMBLoader {
is.close(); is.close();
// create response with loaded content // create response with loaded content
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,

@ -575,7 +575,7 @@ public final class Protocol {
maximumRecords, maximumRecords,
verify, verify,
global, global,
null); ClientIdentification.yacyInternetCrawlerAgent);
} }
protected static int primarySearch( protected static int primarySearch(

@ -187,7 +187,7 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol(); final String protocol = url.getProtocol();
final String host = url.getHost(); final String host = url.getHost();
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle())); final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist // check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) { if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {

@ -536,7 +536,7 @@ public final class Switchboard extends serverSwitch {
} }
// create a crawler // create a crawler
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot); this.crawler = new CrawlSwitchboard(networkName, this);
// start yacy core // start yacy core
this.log.config("Starting YaCy Protocol Core"); this.log.config("Starting YaCy Protocol Core");
@ -1330,7 +1330,7 @@ public final class Switchboard extends serverSwitch {
// create a crawler // create a crawler
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot); this.crawler = new CrawlSwitchboard(networkName, this);
// init a DHT transmission dispatcher // init a DHT transmission dispatcher
this.dhtDispatcher = this.dhtDispatcher =
@ -2009,7 +2009,7 @@ public final class Switchboard extends serverSwitch {
// clear caches // clear caches
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
Word.clearCache(); Word.clearCache();
Domains.clear(); // Domains.clear();
// clean up image stack // clean up image stack
ResultImages.clearQueues(); ResultImages.clearQueues();
@ -2130,9 +2130,24 @@ public final class Switchboard extends serverSwitch {
// clean up profiles // clean up profiles
checkInterruption(); checkInterruption();
//cleanProfiles();
int cleanup = this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? 0 : this.crawler.cleanFinishesProfiles(this.crawlQueues); if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
if (cleanup > 0) log.info("cleanup removed " + cleanup + " crawl profiles"); Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
int cleanup = deletionCandidates.size();
if (cleanup > 0) {
// run postprocessing on these profiles
postprocessingRunning = true;
int proccount = 0;
for (String profileHash: deletionCandidates) {
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, profileHash);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, profileHash);
}
postprocessingRunning = false;
this.crawler.cleanProfiles(deletionCandidates);
log.info("cleanup removed " + cleanup + " crawl profiles, post-processed " + proccount + " documents");
}
}
// clean up news // clean up news
checkInterruption(); checkInterruption();
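
The crawl profile hash thus doubles as a harvest key: when a crawl finishes, only the documents tagged with that key are post-processed, and only then may the profile be deleted. A standalone sketch of that lifecycle (plain JDK; the map and the postprocess counter are stand-ins for the Solr-backed calls above):

    import java.util.*;

    public class HarvestKeySketch {
        // documents remember the profile hash of the crawl that created them
        static final Map<String, String> docToHarvestKey = new HashMap<>();

        static int postprocess(String harvestkey) {
            int n = 0;
            for (Map.Entry<String, String> e : docToHarvestKey.entrySet()) {
                if (harvestkey == null || harvestkey.equals(e.getValue())) n++; // null = process everything
            }
            return n;
        }

        public static void main(String[] args) {
            docToHarvestKey.put("doc1", "profileA");
            docToHarvestKey.put("doc2", "profileA");
            docToHarvestKey.put("doc3", "profileB");
            // crawl with profileA finished: post-process only its documents, then drop the profile
            System.out.println(postprocess("profileA")); // 2
            System.out.println(postprocess(null));       // 3 (the old, unkeyed behaviour)
        }
    }
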
@ -2268,11 +2283,14 @@ public final class Switchboard extends serverSwitch {
// if no crawl is running and processing is activated: // if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned // execute the (post-) processing steps for all entries that have a process tag assigned
if (this.crawlQueues.coreCrawlJobSize() == 0) { if (this.crawlQueues.coreCrawlJobSize() == 0) {
if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches if (this.crawlQueues.noticeURL.isEmpty()) {
Domains.clear();
this.crawlQueues.noticeURL.clear(); // flushes more caches
}
postprocessingRunning = true; postprocessingRunning = true;
int proccount = 0; int proccount = 0;
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index); proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, null);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index); proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, null);
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess; long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess; long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun; long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
@ -2490,13 +2508,13 @@ public final class Switchboard extends serverSwitch {
if (response.profile() != null) { if (response.profile() != null) {
ArrayList<Document> newDocs = new ArrayList<Document>(); ArrayList<Document> newDocs = new ArrayList<Document>();
for (Document doc: documents) { for (Document doc: documents) {
String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/); String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
if (rejectReason == null) { if (rejectReason == null) {
newDocs.add(doc); newDocs.add(doc);
} else { } else {
// we consider this as fail urls to have a tracking of the problem // we consider this as fail urls to have a tracking of the problem
if (rejectReason != null && !rejectReason.startsWith("double in")) { if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle())); final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
} }
} }
@ -2659,18 +2677,28 @@ public final class Switchboard extends serverSwitch {
// the condenser may be null in case that an indexing is not wanted (there may be a no-indexing flag in the file) // the condenser may be null in case that an indexing is not wanted (there may be a no-indexing flag in the file)
if ( in.condenser != null ) { if ( in.condenser != null ) {
for ( int i = 0; i < in.documents.length; i++ ) { for ( int i = 0; i < in.documents.length; i++ ) {
CrawlProfile profile = in.queueEntry.profile();
storeDocumentIndex( storeDocumentIndex(
in.queueEntry, in.queueEntry,
in.queueEntry.profile().collections(), in.queueEntry.profile().collections(),
in.documents[i], in.documents[i],
in.condenser[i], in.condenser[i],
null, null,
"crawler/indexing queue"); profile == null ? "crawler" : profile.handle());
} }
} }
in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED); in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
} }
/**
*
* @param queueEntry
* @param collections
* @param document
* @param condenser
* @param searchEvent
* @param sourceName if this document was created by a crawl, then the sourceName contains the crawl hash
*/
private void storeDocumentIndex( private void storeDocumentIndex(
final Response queueEntry, final Response queueEntry,
final Map<String, Pattern> collections, final Map<String, Pattern> collections,
@ -2821,7 +2849,7 @@ public final class Switchboard extends serverSwitch {
public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) { public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) {
if (rootURLs == null || rootURLs.size() == 0) return; if (rootURLs == null || rootURLs.size() == 0) return;
List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently final List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
for (DigestURL url: rootURLs) { for (DigestURL url: rootURLs) {
final DigestURL turl = url; final DigestURL turl = url;
Thread t = new Thread() { Thread t = new Thread() {
@ -2832,9 +2860,9 @@ public final class Switchboard extends serverSwitch {
}; };
t.start(); t.start();
stackthreads.add(t); stackthreads.add(t);
try {Thread.sleep(10);} catch (final InterruptedException e) {} // to prevent that this fires more than 100 connections pre second! try {Thread.sleep(100);} catch (final InterruptedException e) {} // to prevent that this fires more than 10 connections pre second!
} }
long waitingtime = 1 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {} for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
} }
@ -2974,8 +3002,8 @@ public final class Switchboard extends serverSwitch {
continue; continue;
} }
final Request request = this.loader.request(e.getValue(), true, true); final Request request = this.loader.request(e.getValue(), true, true);
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0); final String acceptedError = this.crawlStacker.checkAcceptanceChangeable(e.getValue(), profile, 0);
if (acceptedError != null) { if (acceptedError != null) {
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError); this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
continue; continue;
@ -3004,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse(); final Document[] documents = response.parse();
if (documents != null) { if (documents != null) {
for (final Document document: documents) { for (final Document document: documents) {
final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url); throw new Parser.Failure("indexing is denied", url);
} }
@ -3047,8 +3075,9 @@ public final class Switchboard extends serverSwitch {
if (existingids.contains(e.getKey())) continue; // double if (existingids.contains(e.getKey())) continue; // double
DigestURL url = e.getValue(); DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true); final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0); String acceptedError = this.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
if (acceptedError != null) { if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
return; return;

@ -599,7 +599,7 @@ public class Segment {
final Document document, final Document document,
final Condenser condenser, final Condenser condenser,
final SearchEvent searchEvent, final SearchEvent searchEvent,
final String sourceName, final String sourceName, // contains the crawl profile hash if this comes from a web crawl
final boolean storeToRWI final boolean storeToRWI
) { ) {
final long startTime = System.currentTimeMillis(); final long startTime = System.currentTimeMillis();
@ -619,7 +619,7 @@ public class Segment {
char docType = Response.docType(document.dc_format()); char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT // CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration(), sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION // ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) { if (this.connectedCitation()) {

@ -79,8 +79,16 @@ public final class QueryParams {
} }
} }
private static final CollectionSchema[] defaultfacetfields = new CollectionSchema[]{ private static final Map<String, CollectionSchema> defaultfacetfields = new HashMap<String, CollectionSchema>();
CollectionSchema.host_s, CollectionSchema.url_protocol_s, CollectionSchema.url_file_ext_s, CollectionSchema.author_sxt}; static {
// the key shall match with configuration property search.navigation
defaultfacetfields.put("location", CollectionSchema.coordinate_p);
defaultfacetfields.put("hosts", CollectionSchema.host_s);
defaultfacetfields.put("protocol", CollectionSchema.url_protocol_s);
defaultfacetfields.put("filetype", CollectionSchema.url_file_ext_s);
defaultfacetfields.put("authors", CollectionSchema.author_sxt);
//missing: namespace
}
private static final int defaultmaxfacets = 30; private static final int defaultmaxfacets = 30;
private static final String ampersand = "&amp;"; private static final String ampersand = "&amp;";
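
A standalone illustration of the new navigation-to-facet lookup (plain JDK; the string field names stand in for the CollectionSchema constants): only keys listed in the search.navigation property activate a facet, and unknown keys are simply skipped.

    import java.util.*;

    public class FacetLookupSketch {
        static final Map<String, String> FACETS = new HashMap<>();
        static {
            FACETS.put("location", "coordinate_p");
            FACETS.put("hosts", "host_s");
            FACETS.put("protocol", "url_protocol_s");
            FACETS.put("filetype", "url_file_ext_s");
            FACETS.put("authors", "author_sxt");
        }
        public static void main(String[] args) {
            String[] searchNavigation = "hosts,authors,namespace".split(","); // from search.navigation
            Set<String> facetfields = new LinkedHashSet<>();
            for (String navkey : searchNavigation) {
                String f = FACETS.get(navkey);
                if (f != null) facetfields.add(f); // "namespace" has no facet field and is skipped
            }
            System.out.println(facetfields); // [host_s, author_sxt]
        }
    }
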
@ -132,7 +140,8 @@ public final class QueryParams {
final Bitfield constraint, final Bitfield constraint,
final Segment indexSegment, final Segment indexSegment,
final RankingProfile ranking, final RankingProfile ranking,
final String userAgent) { final String userAgent,
final String[] search_navigation) {
this.queryGoal = new QueryGoal(query_original, query_words); this.queryGoal = new QueryGoal(query_original, query_words);
this.ranking = ranking; this.ranking = ranking;
this.modifier = new QueryModifier(); this.modifier = new QueryModifier();
@ -169,8 +178,9 @@ public final class QueryParams {
this.facetfields = new LinkedHashSet<String>(); this.facetfields = new LinkedHashSet<String>();
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration(); this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
for (CollectionSchema f: defaultfacetfields) { for (String navkey: search_navigation) {
if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName()); CollectionSchema f = defaultfacetfields.get(navkey);
if (f != null && solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
} }
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
this.maxfacets = defaultmaxfacets; this.maxfacets = defaultmaxfacets;
@ -205,7 +215,8 @@ public final class QueryParams {
final boolean filterscannerfail, final boolean filterscannerfail,
final double lat, final double lat,
final double lon, final double lon,
final double radius final double radius,
final String[] search_navigation
) { ) {
this.queryGoal = queryGoal; this.queryGoal = queryGoal;
this.modifier = modifier; this.modifier = modifier;
@ -269,8 +280,9 @@ public final class QueryParams {
this.facetfields = new LinkedHashSet<String>(); this.facetfields = new LinkedHashSet<String>();
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration(); this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
for (CollectionSchema f: defaultfacetfields) { for (String navkey: search_navigation) {
if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName()); CollectionSchema f = defaultfacetfields.get(navkey);
if (f != null && solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
} }
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
this.maxfacets = defaultmaxfacets; this.maxfacets = defaultmaxfacets;

@ -136,6 +136,7 @@ public final class SearchEvent {
private Thread localsolrsearch; private Thread localsolrsearch;
private int localsolroffset; private int localsolroffset;
private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons
public final ScoreMap<String> locationNavigator; // a counter for the appearance of location coordinates
public final ScoreMap<String> hostNavigator; // a counter for the appearance of host names public final ScoreMap<String> hostNavigator; // a counter for the appearance of host names
public final ScoreMap<String> authorNavigator; // a counter for the appearances of authors public final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
public final ScoreMap<String> namespaceNavigator; // a counter for name spaces public final ScoreMap<String> namespaceNavigator; // a counter for name spaces
@ -225,6 +226,7 @@ public final class SearchEvent {
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true); this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
// prepare configured search navigation // prepare configured search navigation
final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", ""); final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
this.locationNavigator = navcfg.contains("location") ? new ConcurrentScoreMap<String>() : null;
this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null; this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null;
this.namespaceNavigator = navcfg.contains("namespace") ? new ConcurrentScoreMap<String>() : null; this.namespaceNavigator = navcfg.contains("namespace") ? new ConcurrentScoreMap<String>() : null;
this.hostNavigator = navcfg.contains("hosts") ? new ConcurrentScoreMap<String>() : null; this.hostNavigator = navcfg.contains("hosts") ? new ConcurrentScoreMap<String>() : null;
@ -741,6 +743,17 @@ public final class SearchEvent {
// collect navigation information // collect navigation information
ReversibleScoreMap<String> fcts; ReversibleScoreMap<String> fcts;
if (this.locationNavigator != null) {
fcts = facets.get(CollectionSchema.coordinate_p.getSolrFieldName());
if (fcts != null) {
for (String coordinate: fcts) {
int hc = fcts.get(coordinate);
if (hc == 0) continue;
this.locationNavigator.inc(coordinate, hc);
}
}
}
if (this.hostNavigator != null) { if (this.hostNavigator != null) {
fcts = facets.get(CollectionSchema.host_s.getSolrFieldName()); fcts = facets.get(CollectionSchema.host_s.getSolrFieldName());
if (fcts != null) { if (fcts != null) {

@@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
 import net.yacy.search.schema.WebgraphConfiguration.Subgraph;

 import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;

 public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@@ -169,52 +169,33 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
     }

+    /**
+     * Convert a SolrDocument to a SolrInputDocument.
+     * This is useful if a document from the search index shall be modified and indexed again.
+     * This shall be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
+     * which are created automatically during the indexing process.
+     * @param doc the solr document
+     * @return a solr input document
+     */
     public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
-        SolrInputDocument sid = new SolrInputDocument();
-        for (String name: doc.getFieldNames()) {
-            if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
-                sid.addField(name, doc.getFieldValue(name), 1.0f);
-            }
-        }
-        return sid;
+        return toSolrInputDocument(doc, omitFields);
     }

     public SolrDocument toSolrDocument(final SolrInputDocument doc) {
-        SolrDocument sd = new SolrDocument();
-        for (SolrInputField field: doc) {
-            if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
-                sd.setField(field.getName(), field.getValue());
-            }
-        }
-        return sd;
+        return toSolrDocument(doc, omitFields);
     }

     /**
      * add uri attributes to solr document
      * @param doc
      * @param allAttr
-     * @param digestURI
+     * @param digestURL
      * @param doctype
      * @return the normalized url
      */
-    public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) {
-        add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
-        String us = digestURI.toNormalform(true);
+    public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL, final char doctype) {
+        add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
+        if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
+        String us = digestURL.toNormalform(true);
         add(doc, CollectionSchema.sku, us);
         if (allAttr || contains(CollectionSchema.ip_s)) {
-            final InetAddress address = digestURI.getInetAddress();
+            final InetAddress address = digestURL.getInetAddress();
             if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
         }
         String host = null;
-        if ((host = digestURI.getHost()) != null) {
+        if ((host = digestURL.getHost()) != null) {
             String dnc = Domains.getDNC(host);
             String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
             int p = subdomOrga.lastIndexOf('.');
@@ -228,17 +209,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }

         // path elements of link
-        String filename = digestURI.getFileName();
+        String filename = digestURL.getFileName();
         String extension = MultiProtocolURL.getFileExtension(filename);
         if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
-        if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol());
-        if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
+        if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
+        if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths());
         if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
         if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
         if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype));

-        Map<String, String> searchpart = digestURI.getSearchpartMap();
+        Map<String, String> searchpart = digestURL.getSearchpartMap();
         if (searchpart == null) {
             if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
         } else {
@@ -309,7 +290,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // fields that are in URIMetadataRow additional to yacy2solr basic requirement
         if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate());
         if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate());
-        if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash());
         if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
         if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
         if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
@@ -357,27 +337,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
     }

     public SolrVector yacy2solr(
-            final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
+            final Map<String, Pattern> collections, final ResponseHeader responseHeader,
             final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
             final IndexCell<CitationReference> citations,
-            final WebgraphConfiguration webgraph) {
+            final WebgraphConfiguration webgraph, final String sourceName) {
         // we use the SolrCell design as index schema
         SolrVector doc = new SolrVector();
-        final DigestURL digestURI = document.dc_source();
+        final DigestURL digestURL = document.dc_source();
+        final String id = ASCII.String(digestURL.hash());
         boolean allAttr = this.isEmpty();
-        String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
+        String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));

         Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();

-        add(doc, CollectionSchema.id, id);
-        String us = digestURI.toNormalform(true);
+        String us = digestURL.toNormalform(true);

         int clickdepth = 999;
         if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
-            if (digestURI.probablyRootURL()) {
-                boolean lc = this.lazy; this.lazy = false;
+            if (digestURL.probablyRootURL()) {
                 clickdepth = 0;
-                this.lazy = lc;
             } else {
                 clickdepth = 999;
             }
@@ -693,7 +671,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri

             // canonical tag
             if (allAttr || contains(CollectionSchema.canonical_s)) {
-                final DigestURL canonical = html.getCanonical();
+                DigestURL canonical = html.getCanonical();
+                // if there is no canonical in the html then look into the http header:
+                if (canonical == null) {
+                    String link = responseHeader.get("Link", null);
+                    int p;
+                    if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
+                        link = link.substring(0, p).trim();
+                        p = link.indexOf('<');
+                        int q = link.lastIndexOf('>');
+                        if (p >= 0 && q > 0) {
+                            link = link.substring(p + 1, q);
+                            try {
+                                canonical = new DigestURL(link);
+                            } catch (MalformedURLException e) {}
+                        }
+                    }
+                }
                 if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
                     containsCanonical = true;
                     inboundLinks.remove(canonical);
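
The header fallback parses an HTTP Link header of the form <url>; rel="canonical". A standalone restatement of that extraction logic:

    public class CanonicalLinkSketch {
        static String canonicalFrom(String link) {
            int p = link.indexOf("rel=\"canonical\"");
            if (p <= 0) return null; // no canonical relation in this header
            String head = link.substring(0, p).trim();
            int lt = head.indexOf('<');
            int gt = head.lastIndexOf('>');
            return (lt >= 0 && gt > lt) ? head.substring(lt + 1, gt) : null;
        }
        public static void main(String[] args) {
            System.out.println(canonicalFrom("<https://example.org/page>; rel=\"canonical\""));
            // -> https://example.org/page
        }
    }
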
@@ -712,7 +706,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             if (refresh != null && refresh.length() > 0) {
                 MultiProtocolURL refreshURL;
                 try {
-                    refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath());
+                    refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
                     if (refreshURL != null) {
                         inboundLinks.remove(refreshURL);
                         outboundLinks.remove(refreshURL);
@@ -785,7 +779,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }

         String content = document.getTextString();
-        String tokens = digestURI.toTokens();
+        String tokens = digestURL.toTokens();
         if (content == null || content.length() == 0) {
             content = tokens;
         } else {
@@ -798,9 +792,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             }
         }

-        if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) {
+        if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
             add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
-            content = digestURI.toTokens(); // remove all other entry but the url tokens
+            content = digestURL.toTokens(); // remove all other entry but the url tokens
         }

         // content (must be written after special parser data, since this can influence the content)
@@ -824,7 +818,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // create a subgraph
         if (!containsCanonical) {
             // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
-            webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations);
+            webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations, sourceName);
         }

         // list all links
@@ -850,7 +844,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
         if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
         if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
-        if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash());
         if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
         //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
         if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
@@ -874,6 +867,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             List<String> p = new ArrayList<String>();
             for (ProcessType t: processTypes) p.add(t.name());
             add(doc, CollectionSchema.process_sxt, p);
+            if (allAttr || contains(CollectionSchema.harvestkey_s)) {
+                add(doc, CollectionSchema.harvestkey_s, sourceName);
+            }
         }
         return doc;
     }
@@ -885,16 +881,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
     * @param urlCitation
     * @return
     */
-    public int postprocessing(final Segment segment) {
+    public int postprocessing(final Segment segment, String harvestkey) {
        if (!this.contains(CollectionSchema.process_sxt)) return 0;
        if (!segment.connectedCitation()) return 0;
-        SolrConnector connector = segment.fulltext().getDefaultConnector();
-        connector.commit(true); // make sure that we have latest information that can be found
+        SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
+        SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
+        collectionConnector.commit(true); // make sure that we have latest information that can be found
        ReferenceReportCache rrCache = segment.getReferenceReportCache();
        Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
+        ReversibleScoreMap<String> hostscore = null;
        try {
            // collect hosts from index which shall take part in citation computation
-            ReversibleScoreMap<String> hostscore = connector.getFacets(CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
+            hostscore = collectionConnector.getFacets(
+                    (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+                    CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
+                    10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
            if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
            // for each host, do a citation rank computation
            for (String host: hostscore.keyList(true)) {
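
The string handed to getFacets is a plain Solr boolean query; restated standalone (the harvest key value is a hypothetical profile hash):

    public class HarvestQuerySketch {
        static String facetQuery(String harvestkey) {
            return (harvestkey == null ? "" : "harvestkey_s:\"" + harvestkey + "\" AND ")
                    + "process_sxt:CITATION";
        }
        public static void main(String[] args) {
            System.out.println(facetQuery("AbCdEfGh")); // harvestkey_s:"AbCdEfGh" AND process_sxt:CITATION
            System.out.println(facetQuery(null));       // process_sxt:CITATION  (the old, global behaviour)
        }
    }
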
@@ -912,11 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 ranking.putAll(crn); // accumulate this here for usage in document update later
             }
         } catch (final IOException e2) {
+            hostscore = new ClusteredScoreMap<String>();
         }

-        // process all documents
-        BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
+        // process all documents at the webgraph for the outgoing links of this document
         SolrDocument doc;
+        if (webgraphConnector != null) {
+            for (String host: hostscore.keyList(true)) {
+                if (hostscore.get(host) <= 0) continue;
+                // select all webgraph edges and modify their cr value
+                BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
+                        WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
+                        0, 10000000, 60000, 50);
+                try {
+                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                        boolean changed = false;
+                        SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
+                        byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
+                        CRV crv = ranking.get(id);
+                        if (crv != null) {
+                            sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
+                            changed = true;
+                        }
+                        id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
+                        crv = ranking.get(id);
+                        if (crv != null) {
+                            sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
+                            changed = true;
+                        }
+                        if (changed) try {
+                            webgraphConnector.add(sid);
+                        } catch (SolrException e) {
+                        } catch (IOException e) {
+                        }
+                    }
+                } catch (final InterruptedException e) {}
+            }
+        }
+
+        // process all documents in collection
+        BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
+                (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+                CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
+                0, 10000, 60000, 50);
         int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
         Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
         Set<String> uniqueURLs = new HashSet<String>();
@@ -964,12 +1003,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 }
                 if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;

-                // all processing steps checked, remove the processing tag
+                // all processing steps checked, remove the processing and harvesting key
                 sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
+                sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());

                 // send back to index
                 //connector.deleteById(ASCII.String(id));
-                connector.add(sid);
+                collectionConnector.add(sid);
                 proccount++;
             } catch (final Throwable e1) {
             }
@@ -1269,6 +1310,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             }
             configuration.add(doc, CollectionSchema.collection_sxt, cs);
         }

+        // clickdepth, cr and postprocessing
+        Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
+        if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
+            processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
+            CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
+        }
+        if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
+            processTypes.add(ProcessType.CITATION); // postprocessing needed
+        }
+        if (allAttr || configuration.contains(CollectionSchema.process_sxt)) {
+            List<String> p = new ArrayList<String>();
+            for (ProcessType t: processTypes) p.add(t.name());
+            configuration.add(doc, CollectionSchema.process_sxt, p);
+        }
+
         return doc;
     }

@@ -59,6 +59,7 @@ public enum CollectionSchema implements SchemaDeclaration {
     references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
     clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
     process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
+    harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),

     // optional but recommended, part of index distribution
     load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
@ -231,6 +232,23 @@ public enum CollectionSchema implements SchemaDeclaration {
this.omitNorms = omitNorms; this.omitNorms = omitNorms;
this.searchable = searchable; this.searchable = searchable;
this.comment = comment; this.comment = comment;
// verify our naming scheme
String name = this.name();
int p = name.indexOf('_');
if (p > 0) {
String ext = name.substring(p + 1);
assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
assert !ext.equals("val") || (type == SolrType.num_integer && multiValued) : name;
assert !ext.equals("d") || (type == SolrType.num_double && !multiValued) : name;
}
assert type.appropriateName(this) : "bad configuration: " + this.name();
}
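
Editor's note: the constructor block added above enforces the field-naming convention, i.e. the suffix after the first underscore must match the declared Solr type and multi-valuedness. Because it uses indexOf('_') rather than lastIndexOf('_'), names with several underscores such as source_cr_host_norm_i yield a suffix like "cr_host_norm_i" that matches none of the asserts and therefore passes unchecked. A sketch of the convention; the lookup table below is illustrative, not YaCy code:

import java.util.HashMap;
import java.util.Map;

public class SuffixConventionSketch {
    // illustrative suffix table mirroring the asserts above
    private static final Map<String, String> EXPECTED = new HashMap<String, String>();
    static {
        EXPECTED.put("i", "num_integer, single-valued");
        EXPECTED.put("l", "num_long, single-valued");
        EXPECTED.put("b", "bool, single-valued");
        EXPECTED.put("s", "string, single-valued");
        EXPECTED.put("sxt", "string, multi-valued");
        EXPECTED.put("dt", "date, single-valued");
        EXPECTED.put("t", "text_general, single-valued");
        EXPECTED.put("coordinate", "coordinate, single-valued");
        EXPECTED.put("txt", "text_general, multi-valued");
        EXPECTED.put("val", "num_integer, multi-valued");
        EXPECTED.put("d", "num_double, single-valued");
    }

    public static void main(String[] args) {
        String[] names = {"clickdepth_i", "process_sxt", "harvestkey_s", "source_cr_host_norm_i"};
        for (String name : names) {
            String ext = name.substring(name.indexOf('_') + 1); // first '_', as in the patch
            System.out.println(name + " -> " + EXPECTED.getOrDefault(ext, "suffix not checked"));
        }
    }
}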

@@ -117,7 +117,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
- final IndexCell<CitationReference> citations) {
+ final IndexCell<CitationReference> citations, final String sourceName) {
boolean allAttr = this.isEmpty();
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
@@ -284,6 +284,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(edge, CollectionSchema.harvestkey_s, sourceName);
}
}
// add the edge to the subgraph
@@ -291,7 +294,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
}
- public int postprocessing(Segment segment) {
+ public int postprocessing(final Segment segment, final String harvestkey) {
if (!this.contains(WebgraphSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
if (!segment.fulltext().writeToWebgraph()) return 0;
@@ -299,7 +302,10 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// that means we must search for those entries.
connector.commit(true); // make sure that we have latest information that can be found
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
- BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 100000, 60000, 50);
+ BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
+         (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
+         WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
+         0, 100000, 60000, 50);
SolrDocument doc;
String protocol, urlstub, id;
@@ -335,6 +341,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// all processing steps checked, remove the processing tag
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
// send back to index
connector.add(sid);

@@ -36,6 +36,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
@@ -51,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@@ -85,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),
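
Editor's note: the source_cr_host_norm_i and target_cr_host_norm_i copies make the citation-rank norm available on the edge documents themselves, so webgraph queries can be ranked without a join back to the collection index. A hypothetical SolrJ sketch; the host value and the sort choice are illustrative, not an API this commit adds:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;

public class WebgraphRankQuerySketch {
    public static void main(String[] args) {
        // outgoing links of one (made-up) host, best-ranked targets first
        SolrQuery q = new SolrQuery("source_host_s:\"example.net\"");
        q.setSort("target_cr_host_norm_i", ORDER.desc);
        q.setRows(10);
        System.out.println(q);
    }
}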
@@ -114,6 +117,23 @@ public enum WebgraphSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
// verify our naming scheme
String name = this.name();
int p = name.indexOf('_');
if (p > 0) {
String ext = name.substring(p + 1);
assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
assert !ext.equals("val") || (type == SolrType.num_integer && multiValued) : name;
assert !ext.equals("d") || (type == SolrType.num_double && !multiValued) : name;
}
assert type.appropriateName(this) : "bad configuration: " + this.name();
}

@@ -391,7 +391,6 @@ public final class TemplateEngine {
// #%
} else if ((bb & 0xFF) == pcChar) { //include
- final ByteBuffer include = new ByteBuffer();
keyStream.reset(); //reset stream
if(transferUntil(pis, keyStream, iClose)){
byte[] filename = keyStream.toByteArray();
@@ -403,6 +402,7 @@ public final class TemplateEngine {
filename= replacePattern(patternkey, pattern, dflt);
}
if (filename.length > 0 && !java.util.Arrays.equals(filename, dflt)) {
final ByteBuffer include = new ByteBuffer();
BufferedReader br = null;
try{
//br = new BufferedReader(new InputStreamReader(new FileInputStream( filename ))); //Simple Include
@@ -422,9 +422,9 @@ public final class TemplateEngine {
structure.append(ASCII.getBytes("<fileinclude file=\"")).append(filename).append(close_tagn);
structure.append(writeTemplate(pis2, out, pattern, dflt, new byte[0])); //clear pattern prefix for include
structure.append(ASCII.getBytes("</fileinclude>\n"));
include.close();
}
}
// # - no special character. This is simply a '#' without meaning
} else { //no match, but a single hash (output # + bb)
out.write(hashChar);
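
Editor's note: the TemplateEngine hunks move the ByteBuffer allocation from the top of the include branch down to the point where an include is actually performed, and add the previously missing include.close(). A reduced sketch of that pattern using a standard-library buffer in place of YaCy's own ByteBuffer class:

import java.io.ByteArrayOutputStream;
import java.io.IOException;

public class IncludeBufferSketch {
    static byte[] readInclude(byte[] filename) throws IOException {
        if (filename.length == 0) return new byte[0]; // no include requested, no buffer allocated
        ByteArrayOutputStream include = new ByteArrayOutputStream();
        try {
            include.write(filename); // stand-in for reading the included template file
            return include.toByteArray();
        } finally {
            include.close(); // mirrors the include.close() added by the patch
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(readInclude("header.template".getBytes()).length);
    }
}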

@@ -63,7 +63,7 @@ public class serverSwitch
public final File dataPath;
public final File appPath;
protected boolean firstInit;
- protected ConcurrentLog log;
+ public ConcurrentLog log;
protected int serverJobs;
private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved;
