Added federated index storage to solr.

YaCy now supports storage to remote Solr indexes.
More federated storage (and search) methods may follow.

The remote index scheme is the same as produced by the SolrCell; see
http://wiki.apache.org/solr/ExtractingRequestHandler
Because this default scheme is used, the default example scheme can be used as solr configuration
This is also the same scheme that solr uses if documents are imported with apache tika.

federated solr storage is switched off by default.

To use this, do the following:
- set federated.service.solr.indexing.enabled = true
- download solr from http://www.apache.org/dyn/closer.cgi/lucene/solr/
- extract the solr (3.1) package, 'cd example' and start solr with 'java -jar start.jar'
- start yacy and then start a crawler. The crawler will fill both, YaCy and solr indexes.
- to check what's in Solr after indexing, open http://localhost:8983/solr/admin/

For now it is not possible to search with YaCy inside that remote Solr index.
The federated storage functionality is provided for two reasons:
1) to compare the functionality of Solr and YaCy and to compare the search speed
2) to use YaCy as a search appliance for people who need a crawler or other source harvesting methods
   that YaCy provides (like dublin core reading, wikimedia dump reading, rss feed reader etc) if people still
   want to use solr instead of YaCy.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7654 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent c17d102bd8
commit 19fd13d3bc

@ -41,6 +41,9 @@
<classpathentry kind="lib" path="lib/pdfbox-1.2.1.jar"/>
<classpathentry kind="lib" path="lib/commons-fileupload-1.2.2.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.16.jar"/>
<classpathentry kind="lib" path="lib/slf4j-api-1.5.5.jar"/>
<classpathentry kind="lib" path="lib/slf4j-jdk14-1.5.5.jar"/>
<classpathentry kind="lib" path="lib/apache-solr-solrj-3.1.0.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="lib" path="lib/icu4j-core.jar"/>
<classpathentry kind="output" path="gen"/>

@ -37,6 +37,9 @@
<key>ClassPath</key>
<array>
<string>$JAVAROOT/htroot</string>
<string>$JAVAROOT/lib/activation.jar</string>
<string>$JAVAROOT/lib/apache-mime4j-0.6.jar</string>
<string>$JAVAROOT/lib/apache-solr-solrj-3.1.0.jar</string>
<string>$JAVAROOT/lib/bcmail-jdk15-145.jar</string>
<string>$JAVAROOT/lib/bcprov-jdk15-145.jar</string>
<string>$JAVAROOT/lib/bzip2.jar</string>
@ -46,13 +49,11 @@
<string>$JAVAROOT/lib/commons-jxpath-1.3.jar</string>
<string>$JAVAROOT/lib/commons-logging-1.1.1.jar</string>
<string>$JAVAROOT/lib/fontbox-1.2.1.jar</string>
<string>$JAVAROOT/lib/activation.jar</string>
<string>$JAVAROOT/lib/apache-mime4j-0.6.jar</string>
<string>$JAVAROOT/lib/J7Zip-modified.jar</string>
<string>$JAVAROOT/lib/httpclient-4.1.jar</string>
<string>$JAVAROOT/lib/httpcore-4.1.jar</string>
<string>$JAVAROOT/lib/httpmime-4.1.jar</string>
<string>$JAVAROOT/lib/icu4j-core.jar</string>
<string>$JAVAROOT/lib/J7Zip-modified.jar</string>
<string>$JAVAROOT/lib/jakarta-oro-2.0.8.jar</string>
<string>$JAVAROOT/lib/jcifs-1.3.15.jar</string>
<string>$JAVAROOT/lib/jsch-0.1.42.jar</string>
@ -63,6 +64,8 @@
<string>$JAVAROOT/lib/poi-3.6-20091214.jar</string>
<string>$JAVAROOT/lib/poi-scratchpad-3.6-20091214.jar</string>
<string>$JAVAROOT/lib/servlet-api.jar</string>
<string>$JAVAROOT/lib/slf4j-api-1.5.5.jar</string>
<string>$JAVAROOT/lib/slf4j-jdk14-1.5.5.jar</string>
<string>$JAVAROOT/lib/webcat-0.1-swf.jar</string>
<string>$JAVAROOT/lib/xerces.jar</string>
<string>$JAVAROOT/lib/yacycore.jar</string>

@ -181,6 +181,7 @@
<pathelement location="${htroot}" />
<pathelement location="${lib}/activation.jar" />
<pathelement location="${lib}/apache-mime4j-0.6.jar" />
<pathelement location="${lib}/apache-solr-solrj-3.1.0.jar" />
<pathelement location="${lib}/bcmail-jdk15-145.jar" />
<pathelement location="${lib}/bcprov-jdk15-145.jar" />
<pathelement location="${lib}/bzip2.jar" />
@ -205,6 +206,8 @@
<pathelement location="${lib}/poi-3.6-20091214.jar" />
<pathelement location="${lib}/poi-scratchpad-3.6-20091214.jar" />
<pathelement location="${lib}/servlet-api.jar" />
<pathelement location="${lib}/slf4j-api-1.5.5.jar" />
<pathelement location="${lib}/slf4j-jdk14-1.5.5.jar" />
<pathelement location="${lib}/webcat-0.1-swf.jar" />
<pathelement location="${lib}/xerces.jar" />
</path>

@ -989,3 +989,23 @@ color_signother = #000099
color_searchheadline = #2200CC
color_searchurl = #008000
color_searchurlhover = #008000
# federated index storage and federated search functionality
# federated search means that other search engines may be used together with the built-in indexing.
# each federated search may be able to be used as remote indexing service and/or as remote search service.
# a typical use case for a federated search is a concurrent search from opensearch sources.
# a typical use case for a remote indexing service is a remote solr index. YaCy supports remote solr indexes.
# solr indexes can be filled if enabled is set to true
# the remote index scheme is the same as produced by the SolrCell; see http://wiki.apache.org/solr/ExtractingRequestHandler
# because this default scheme is used the default example scheme can be used as solr configuration
# to use this, do the following:
# - set federated.service.solr.indexing.enabled = true
# - download solr from http://www.apache.org/dyn/closer.cgi/lucene/solr/
# - extract the solr (3.1) package, 'cd example' and start solr with 'java -jar start.jar'
# - start yacy and then start a crawler. The crawler will fill both, YaCy and solr indexes.
# - to check what's in Solr after indexing, open http://localhost:8983/solr/admin/
federated.service.solr.indexing.enabled = false
federated.service.solr.indexing.url = http://127.0.0.1:8983/solr
federated.service.solr.indexing.scheme = SolrCell

@ -58,9 +58,9 @@ public class Table_YMark_p {
// get available tags and folders
count = 0;
/*
byte[] key;
String name;
/*
try {
Iterator<byte[]> iter = sb.tables.keys(YMarkTables.TABLES.TAGS.tablename(bmk_user));
while(iter.hasNext()) {

@ -22,7 +22,7 @@ public class get_ymark {
sb = (Switchboard) env;
prop = new serverObjects();
boolean tags = false;
//boolean tags = false;
final UserDB.Entry user = sb.userDB.getUser(header);
final boolean isAdmin = (sb.verifyAuthentication(header, true));
@ -33,7 +33,7 @@ public class get_ymark {
final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
if(post.containsKey(YMarkTables.BOOKMARK.TAGS.key())) {
tags = true;
//tags = true;
final String[] tagArray = YMarkUtil.cleanTagsString(post.get(YMarkTables.BOOKMARK.TAGS.key())).split(YMarkUtil.TAGS_SEPARATOR);
try {
bookmarks = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray);

@ -26,7 +26,7 @@ import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.SearchSRURSS;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.document.LibraryProvider;
import net.yacy.document.geolocalization.Location;
import de.anomic.crawler.CrawlProfile;
@ -94,7 +94,7 @@ public class yacysearch_location {
// get a queue of search results
String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
SearchSRURSS.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, CrawlProfile.CacheStrategy.NOCACHE, false, null);
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, CrawlProfile.CacheStrategy.NOCACHE, false, null);
// take the results and compute some locations
RSSMessage message;

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

@ -0,0 +1,176 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

@ -0,0 +1,21 @@
Copyright (c) 2004-2008 QOS.ch
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Binary file not shown.

Binary file not shown.

@ -62,19 +62,21 @@ public class Balancer {
private static final int objectIndexBufferSize = 1000;
private static final String localhost = "localhost";
// class variables
// class variables filled with external values
private final File cacheStacksPath;
private long minimumLocalDelta;
private long minimumGlobalDelta;
private final Set<String> myAgentIDs;
private BufferedObjectIndex urlFileIndex;
// class variables computed during operation
private final ConcurrentMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashs
private final ConcurrentLinkedQueue<byte[]> top; // a list of url-hashes that shall be taken next
private final SortedMap<Long, byte[]> delayed;
private final HandleSet ddc;
private final HandleSet double_push_check; // for debugging
private BufferedObjectIndex urlFileIndex;
private final File cacheStacksPath;
private long minimumLocalDelta;
private long minimumGlobalDelta;
private long lastDomainStackFill;
private int domStackInitSize;
private Set<String> myAgentIDs;
public Balancer(
final File cachePath,

@ -50,6 +50,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String PUSH_SOLR = "pushSolr";
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
@ -202,6 +203,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean pushSolr() {
    // Tells whether documents crawled with this profile shall be pushed to the
    // remote Solr index. A missing property counts as enabled (default: true).
    final String flag = get(PUSH_SOLR);
    return (flag == null) ? true : Boolean.TRUE.toString().equals(flag);
}
public boolean indexText() {
final String r = get(INDEX_TEXT);
if (r == null) return true;

@ -87,6 +87,8 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.protocol.http.ProxySettings;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
@ -238,6 +240,7 @@ public final class Switchboard extends serverSwitch {
private final Semaphore shutdownSync = new Semaphore(0);
private boolean terminate = false;
private SolrSingleConnector solrConnector = null;
//private Object crawlingPausedSync = new Object();
//private boolean crawlingIsPaused = false;
@ -581,6 +584,11 @@ public final class Switchboard extends serverSwitch {
}
}
// set up the solr interface
String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
this.solrConnector = (usesolr) ? new SolrSingleConnector("http://127.0.0.1:8983/solr", SolrScheme.SolrCell) : null;
// initializing dht chunk generation
this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
@ -1874,6 +1882,23 @@ public final class Switchboard extends serverSwitch {
public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
if (this.solrConnector != null /*in.queueEntry.profile().pushSolr()*/) {
// send the documents to solr
for (Document doc: in.documents) {
try {
String id = UTF8.String(new DigestURI(doc.dc_identifier(), null).hash());
assert id.equals(UTF8.String(in.queueEntry.url().hash()));
try {
this.solrConnector.add(id, doc);
} catch (IOException e) {
Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
}
} catch (MalformedURLException e) {
Log.logException(e);
continue;
}
}
}
if (!in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia()) {
if (log.isInfo()) {
log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': indexing not wanted by crawl profile");

@ -63,7 +63,7 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.http.HTTPConnector;
import net.yacy.cora.services.SearchSRURSS;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -373,7 +373,7 @@ public final class yacyClient {
public static RSSFeed search(final yacySeed targetSeed, String query, CrawlProfile.CacheStrategy verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8090") : targetSeed.getClusterAddress();
String urlBase = "http://" + address + "/yacysearch.rss";
return SearchSRURSS.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global, null);
return SRURSSConnector.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global, null);
}
@SuppressWarnings("unchecked")

@ -108,6 +108,11 @@ public class HTTPClient {
super();
}
/**
 * Creates a client whose requests identify themselves with the given
 * User-Agent string.
 *
 * @param defaultAgent User-Agent header value to send with every request
 */
public HTTPClient(final String defaultAgent) {
    super();
    // NOTE(review): httpClient looks like a shared (static) field here, so this
    // sets the agent for all users of that client, not just this instance — confirm.
    HttpProtocolParams.setUserAgent(httpClient.getParams(), defaultAgent);
}
/**
 * Sets the User-Agent string used for all subsequent requests of the shared
 * underlying HTTP client.
 *
 * @param defaultAgent User-Agent header value to send with every request
 */
public static void setDefaultUserAgent(final String defaultAgent) {
    HttpProtocolParams.setUserAgent(httpClient.getParams(), defaultAgent);
}
@ -320,7 +325,9 @@ public class HTTPClient {
if (currentRequest != null) throw new IOException("Client is in use!");
final MultiProtocolURI url = new MultiProtocolURI(uri);
final HttpPost httpPost = new HttpPost(url.toNormalform(true, false, true, false));
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
String host = url.getHost();
if (host == null) host = "127.0.0.1";
setHost(host); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
final InputStreamEntity inputStreamEntity = new InputStreamEntity(instream, length);
// statistics
upbytes = length;
@ -340,7 +347,9 @@ public class HTTPClient {
public byte[] POSTbytes(final String uri, final Map<String, ContentBody> parts, final boolean usegzip) throws IOException {
final MultiProtocolURI url = new MultiProtocolURI(uri);
final HttpPost httpPost = new HttpPost(url.toNormalform(true, false, true, false));
setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
String host = url.getHost();
if (host == null) host = "127.0.0.1";
setHost(host); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
final MultipartEntity multipartEntity = new MultipartEntity();
for (final Entry<String,ContentBody> part : parts.entrySet())
@ -357,6 +366,42 @@ public class HTTPClient {
return getContentBytes(httpPost, Long.MAX_VALUE);
}
/**
 * Sends the content of the given stream as the body of a POST request and
 * returns the response body.
 *
 * @param uri target address of the POST request
 * @param instream stream providing the request body
 * @param length number of bytes to read from the stream
 * @return the response body as a byte array
 * @throws IOException if the client is already busy or the transfer fails
 */
public byte[] POSTbytes(final String uri, final InputStream instream, long length) throws IOException {
    // reject re-entrant use: this client tracks a single in-flight request,
    // consistent with the guard in the other streaming POST method
    if (currentRequest != null) throw new IOException("Client is in use!");
    final MultiProtocolURI url = new MultiProtocolURI(uri);
    final HttpPost httpPost = new HttpPost(url.toNormalform(true, false, true, false));
    String host = url.getHost();
    if (host == null) host = "127.0.0.1"; // URIs without an authority part default to localhost
    setHost(host); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
    final InputStreamEntity inputStreamEntity = new InputStreamEntity(instream, length);
    // statistics
    upbytes = length;
    httpPost.setEntity(inputStreamEntity);
    currentRequest = httpPost;
    return getContentBytes(httpPost, Long.MAX_VALUE);
}
/**
 * send data to the server named by vhost
 *
 * @param url address of the server
 * @param vhost name of the server at address which should respond
 * @param post data to send (name-value-pairs)
 * @param usegzip if the body should be gzipped
 * @return response body
 * @throws IOException
 */
public byte[] POSTbytes(final MultiProtocolURI url, final String vhost, final Map<String, ContentBody> post, final boolean usegzip) throws IOException {
    // force the Host header to vhost rather than the resolved address
    this.setHost(vhost);
    try {
        return this.POSTbytes(url.toNormalform(true, false, true, false), post, usegzip);
    } finally {
        // always release the connection, even when the request fails
        this.finish();
    }
}
/**
*
* @return HttpResponse from call
@ -528,7 +573,7 @@ public class HTTPClient {
private void storeConnectionInfo(final HttpUriRequest httpUriRequest) {
final int port = httpUriRequest.getURI().getPort();
final String thost = httpUriRequest.getURI().getHost();
assert thost != null : "uri = " + httpUriRequest.getURI().toString();
//assert thost != null : "uri = " + httpUriRequest.getURI().toString();
ConnectionInfo.addConnection(new ConnectionInfo(
httpUriRequest.getURI().getScheme(),
port == 80 ? thost : thost + ":" + port,

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
package net.yacy.cora.services.federated;
/**
* place-holder class to provide a object declaration for threads in Search object

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
package net.yacy.cora.services.federated;
import java.util.ArrayList;
import java.util.Iterator;
@ -34,6 +34,7 @@ import de.anomic.crawler.CrawlProfile;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ScoreMap;
@ -149,7 +150,7 @@ public class SearchHub {
*/
public static void addSRURSSServices(SearchHub search, String[] rssServices, int count, CrawlProfile.CacheStrategy verify, boolean global, String userAgent) {
for (String service: rssServices) {
SearchSRURSS accumulator = new SearchSRURSS(search, service, count, verify, global, userAgent);
SRURSSConnector accumulator = new SRURSSConnector(search, service, count, verify, global, userAgent);
accumulator.start();
search.addAccumulator(accumulator);
}

@ -24,8 +24,9 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package de.anomic.search;
package net.yacy.cora.services.federated;
import de.anomic.search.ResultEntry;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
public class SearchResult extends WeakPriorityBlockingQueue<ResultEntry> {

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
package net.yacy.cora.services.federated.opensearch;
import java.io.IOException;
import java.net.MalformedURLException;
@ -44,8 +44,10 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.http.HTTPConnector;
import net.yacy.cora.services.federated.SearchAccumulator;
import net.yacy.cora.services.federated.SearchHub;
public class SearchSRURSS extends Thread implements SearchAccumulator {
public class SRURSSConnector extends Thread implements SearchAccumulator {
private final static int recordsPerSession = 100;
@ -60,7 +62,7 @@ public class SearchSRURSS extends Thread implements SearchAccumulator {
private final BlockingQueue<RSSMessage> results;
public SearchSRURSS(
public SRURSSConnector(
final Map<RSSMessage, List<Integer>> result,
final String query,
final long timeoutInit,
@ -80,7 +82,7 @@ public class SearchSRURSS extends Thread implements SearchAccumulator {
this.userAgent = userAgent;
}
public SearchSRURSS(
public SRURSSConnector(
final SearchHub search,
final String urlBase,
final int maximumRecordsInit,

@ -0,0 +1,590 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This file was part of the solrj package and used the apache http client 3.1
* It was modified and adopted to work with the apache http client 4.1
* using the net.yacy.cora connection package of YaCy
* Code modifications (C) under Apache License 2.0 by Michael Christen, 14.4.2011
*/
package net.yacy.cora.services.federated.solr;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import net.yacy.cora.protocol.http.HTTPClient;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.entity.mime.content.InputStreamBody;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.solr.client.solrj.ResponseParser;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.BinaryResponseParser;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.client.solrj.request.RequestWriter;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.DefaultSolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
/**
* The {@link SolrHTTPClient} uses the Apache Commons HTTP Client to connect to solr.
* <pre class="prettyprint" >SolrServer server = new CommonsHttpSolrServer( url );</pre>
*
* @version $Id: CommonsHttpSolrServer.java 1067552 2011-02-05 23:52:42Z koji $
* @since solr 1.3
*/
public class SolrHTTPClient extends SolrServer {
private static final long serialVersionUID = -4532572298724852268L;
/**
* User-Agent String as identified by the HTTP request by the {@link
* org.apache.commons.httpclient.HttpClient HttpClient} to the Solr
* server from the client.
*/
public static final String AGENT = "Solr["+SolrHTTPClient.class.getName()+"] 1.0";
public final static Charset utf8;
static {
utf8 = Charset.forName("UTF-8");
}
/**
* The URL of the Solr server.
*/
protected String _baseURL;
/**
* Default value: null / empty. <p/>
* Parameters that are added to every request regardless. This may be a place to add
* something like an authentication token.
*/
protected ModifiableSolrParams _invariantParams;
/**
* Default response parser is BinaryResponseParser <p/>
* This parser represents the default Response Parser chosen to
* parse the response if the parser were not specified as part of
* the request.
* @see org.apache.solr.client.solrj.impl.BinaryResponseParser
*/
protected ResponseParser _parser;
/**
* The RequestWriter used to write all requests to Solr
* @see org.apache.solr.client.solrj.request.RequestWriter
*/
protected RequestWriter requestWriter = new RequestWriter();
/**
* @param solrServerUrl The URL of the Solr server. For
* example, "<code>http://localhost:8983/solr/</code>"
* if you are using the standard distribution Solr webapp
* on your local machine.
*/
public SolrHTTPClient(String solrServerUrl) throws MalformedURLException {
this(new URL(solrServerUrl));
}
/**
* @param baseURL The URL of the Solr server. For example,
* "<code>http://localhost:8983/solr/</code>" if you are using the
* standard distribution Solr webapp on your local machine.
*/
public SolrHTTPClient(URL baseURL)
{
this(baseURL, new BinaryResponseParser());
}
/**
* @see #useMultiPartPost
* @see #_parser
*/
public SolrHTTPClient(URL baseURL, ResponseParser parser) {
_baseURL = baseURL.toExternalForm();
if( _baseURL.endsWith( "/" ) ) {
_baseURL = _baseURL.substring( 0, _baseURL.length()-1 );
}
if( _baseURL.indexOf( '?' ) >=0 ) {
throw new RuntimeException( "Invalid base url for solrj. The base URL must not contain parameters: "+_baseURL );
}
_parser = parser;
}
//------------------------------------------------------------------------
//------------------------------------------------------------------------
/**
* Process the request. If {@link org.apache.solr.client.solrj.SolrRequest#getResponseParser()} is null, then use
* {@link #getParser()}
* @param request The {@link org.apache.solr.client.solrj.SolrRequest} to process
* @return The {@link org.apache.solr.common.util.NamedList} result
* @throws SolrServerException
* @throws IOException
*
* @see #request(org.apache.solr.client.solrj.SolrRequest, org.apache.solr.client.solrj.ResponseParser)
*/
@Override
public NamedList<Object> request( final SolrRequest request ) throws SolrServerException, IOException
{
ResponseParser responseParser = request.getResponseParser();
if (responseParser == null) {
responseParser = _parser;
}
return request(request, responseParser);
}
public NamedList<Object> request(final SolrRequest request, ResponseParser processor) throws SolrServerException, IOException {
SolrParams params = request.getParams();
Collection<ContentStream> streams = requestWriter.getContentStreams(request);
String path = requestWriter.getPath(request);
if( path == null || !path.startsWith( "/" ) ) {
path = "/select";
}
// The parser 'wt=' and 'version=' params are used instead of the original params
ResponseParser parser = request.getResponseParser();
if( parser == null ) {
parser = _parser;
}
ModifiableSolrParams wparams = new ModifiableSolrParams();
wparams.set( CommonParams.WT, parser.getWriterType() );
wparams.set( CommonParams.VERSION, parser.getVersion());
if( params == null ) {
params = wparams;
}
else {
params = new DefaultSolrParams( wparams, params );
}
if( _invariantParams != null ) {
params = new DefaultSolrParams( _invariantParams, params );
}
byte[] result = null;
HTTPClient client = new HTTPClient();
if (SolrRequest.METHOD.POST == request.getMethod()) {
boolean isMultipart = ( streams != null && streams.size() > 1 );
if (streams == null || isMultipart) {
String url = _baseURL + path;
HashMap<String, ContentBody> parts = new HashMap<String, ContentBody>();
Iterator<String> iter = params.getParameterNamesIterator();
while (iter.hasNext()) {
String p = iter.next();
String[] vals = params.getParams(p);
if (vals != null) {
for (String v : vals) {
if (isMultipart) {
parts.put(p, new StringBody(v, utf8));
} else {
if (url.indexOf('?') >= 0) url += "&" + p + "=" + v; else url += "?" + p + "=" + v;
}
}
}
}
if (isMultipart) {
for (ContentStream content : streams) {
parts.put(content.getName(), new InputStreamBody(content.getStream(), content.getContentType(), null));
}
}
try {
result = client.POSTbytes(url, parts, true);
} finally {
client.finish();
}
} else {
String pstr = ClientUtils.toQueryString(params, false);
String url = _baseURL + path + pstr;
// Single stream as body
// Using a loop just to get the first one
final ContentStream[] contentStream = new ContentStream[1];
for (ContentStream content : streams) {
contentStream[0] = content;
break;
}
result = client.POSTbytes(url, contentStream[0].getStream(), contentStream[0].getStream().available());
}
} else if (SolrRequest.METHOD.GET == request.getMethod()) {
result = client.GETbytes( _baseURL + path + ClientUtils.toQueryString( params, false ));
} else {
throw new SolrServerException("Unsupported method: "+request.getMethod() );
}
int statusCode = client.getStatusCode();
if (statusCode != 200) {
throw new IOException("bad status code: " + statusCode + ", " + client.getHttpResponse().getStatusLine());
}
// Read the contents
//System.out.println("SOLR RESPONSE: " + UTF8.String(result));
InputStream respBody = new ByteArrayInputStream(result);
return processor.processResponse(respBody, "UTF-8");
}
/*
* The original code for the request method
public NamedList<Object> request(final SolrRequest request, ResponseParser processor) throws SolrServerException, IOException {
HttpMethod method = null;
InputStream is = null;
SolrParams params = request.getParams();
Collection<ContentStream> streams = requestWriter.getContentStreams(request);
String path = requestWriter.getPath(request);
if( path == null || !path.startsWith( "/" ) ) {
path = "/select";
}
ResponseParser parser = request.getResponseParser();
if( parser == null ) {
parser = _parser;
}
// The parser 'wt=' and 'version=' params are used instead of the original params
ModifiableSolrParams wparams = new ModifiableSolrParams();
wparams.set( CommonParams.WT, parser.getWriterType() );
wparams.set( CommonParams.VERSION, parser.getVersion());
if( params == null ) {
params = wparams;
}
else {
params = new DefaultSolrParams( wparams, params );
}
if( _invariantParams != null ) {
params = new DefaultSolrParams( _invariantParams, params );
}
int tries = _maxRetries + 1;
try {
while( tries-- > 0 ) {
// Note: since we aren't do intermittent time keeping
// ourselves, the potential non-timeout latency could be as
// much as tries-times (plus scheduling effects) the given
// timeAllowed.
try {
if( SolrRequest.METHOD.GET == request.getMethod() ) {
if( streams != null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "GET can't send streams!" );
}
method = new GetMethod( _baseURL + path + ClientUtils.toQueryString( params, false ) );
}
else if( SolrRequest.METHOD.POST == request.getMethod() ) {
String url = _baseURL + path;
boolean isMultipart = ( streams != null && streams.size() > 1 );
if (streams == null || isMultipart) {
PostMethod post = new PostMethod(url);
post.getParams().setContentCharset("UTF-8");
if (!this.useMultiPartPost && !isMultipart) {
post.addRequestHeader("Content-Type",
"application/x-www-form-urlencoded; charset=UTF-8");
}
List<Part> parts = new LinkedList<Part>();
Iterator<String> iter = params.getParameterNamesIterator();
while (iter.hasNext()) {
String p = iter.next();
String[] vals = params.getParams(p);
if (vals != null) {
for (String v : vals) {
if (this.useMultiPartPost || isMultipart) {
parts.add(new StringPart(p, v, "UTF-8"));
} else {
post.addParameter(p, v);
}
}
}
}
if (isMultipart) {
int i = 0;
for (ContentStream content : streams) {
final ContentStream c = content;
String charSet = null;
String transferEncoding = null;
parts.add(new PartBase(c.getName(), c.getContentType(),
charSet, transferEncoding) {
@Override
protected long lengthOfData() throws IOException {
return c.getSize();
}
@Override
protected void sendData(OutputStream out)
throws IOException {
InputStream in = c.getStream();
try {
IOUtils.copy(in, out);
} finally {
in.close();
}
}
});
}
}
if (parts.size() > 0) {
post.setRequestEntity(new MultipartRequestEntity(parts
.toArray(new Part[parts.size()]), post.getParams()));
}
method = post;
}
// It is has one stream, it is the post body, put the params in the URL
else {
String pstr = ClientUtils.toQueryString(params, false);
PostMethod post = new PostMethod(url + pstr);
// Single stream as body
// Using a loop just to get the first one
final ContentStream[] contentStream = new ContentStream[1];
for (ContentStream content : streams) {
contentStream[0] = content;
break;
}
if (contentStream[0] instanceof RequestWriter.LazyContentStream) {
post.setRequestEntity(new RequestEntity() {
public long getContentLength() {
return -1;
}
public String getContentType() {
return contentStream[0].getContentType();
}
public boolean isRepeatable() {
return false;
}
public void writeRequest(OutputStream outputStream) throws IOException {
((RequestWriter.LazyContentStream) contentStream[0]).writeTo(outputStream);
}
}
);
} else {
is = contentStream[0].getStream();
post.setRequestEntity(new InputStreamRequestEntity(is, contentStream[0].getContentType()));
}
method = post;
}
}
else {
throw new SolrServerException("Unsupported method: "+request.getMethod() );
}
}
catch( NoHttpResponseException r ) {
// This is generally safe to retry on
method.releaseConnection();
method = null;
if(is != null) {
is.close();
}
// If out of tries then just rethrow (as normal error).
if( ( tries < 1 ) ) {
throw r;
}
//log.warn( "Caught: " + r + ". Retrying..." );
}
}
}
catch( IOException ex ) {
throw new SolrServerException("error reading streams", ex );
}
method.setFollowRedirects( _followRedirects );
method.addRequestHeader( "User-Agent", AGENT );
if( _allowCompression ) {
method.setRequestHeader( new Header( "Accept-Encoding", "gzip,deflate" ) );
}
try {
// Execute the method.
//System.out.println( "EXECUTE:"+method.getURI() );
int statusCode = _httpClient.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
StringBuilder msg = new StringBuilder();
msg.append( method.getStatusLine().getReasonPhrase() );
msg.append( "\n\n" );
msg.append( method.getStatusText() );
msg.append( "\n\n" );
msg.append( "request: "+method.getURI() );
throw new SolrException(statusCode, java.net.URLDecoder.decode(msg.toString(), "UTF-8") );
}
// Read the contents
String charset = "UTF-8";
if( method instanceof HttpMethodBase ) {
charset = ((HttpMethodBase)method).getResponseCharSet();
}
InputStream respBody = method.getResponseBodyAsStream();
// Jakarta Commons HTTPClient doesn't handle any
// compression natively. Handle gzip or deflate
// here if applicable.
if( _allowCompression ) {
Header contentEncodingHeader = method.getResponseHeader( "Content-Encoding" );
if( contentEncodingHeader != null ) {
String contentEncoding = contentEncodingHeader.getValue();
if( contentEncoding.contains( "gzip" ) ) {
//log.debug( "wrapping response in GZIPInputStream" );
respBody = new GZIPInputStream( respBody );
}
else if( contentEncoding.contains( "deflate" ) ) {
//log.debug( "wrapping response in InflaterInputStream" );
respBody = new InflaterInputStream(respBody);
}
}
else {
Header contentTypeHeader = method.getResponseHeader( "Content-Type" );
if( contentTypeHeader != null ) {
String contentType = contentTypeHeader.getValue();
if( contentType != null ) {
if( contentType.startsWith( "application/x-gzip-compressed" ) ) {
//log.debug( "wrapping response in GZIPInputStream" );
respBody = new GZIPInputStream( respBody );
}
else if ( contentType.startsWith("application/x-deflate") ) {
//log.debug( "wrapping response in InflaterInputStream" );
respBody = new InflaterInputStream(respBody);
}
}
}
}
}
return processor.processResponse(respBody, charset);
}
catch (HttpException e) {
throw new SolrServerException( e );
}
catch (IOException e) {
throw new SolrServerException( e );
}
finally {
method.releaseConnection();
if(is != null) {
is.close();
}
}
}
*/
//-------------------------------------------------------------------
//-------------------------------------------------------------------
/**
* Retrieve the default list of parameters are added to every request regardless.
*
* @see #_invariantParams
*/
public ModifiableSolrParams getInvariantParams()
{
return _invariantParams;
}
public String getBaseURL() {
return _baseURL;
}
public void setBaseURL(String baseURL) {
this._baseURL = baseURL;
}
public ResponseParser getParser() {
return _parser;
}
/**
* Note: This setter method is <b>not thread-safe</b>.
* @param processor Default Response Parser chosen to parse the response if the parser were not specified as part of the request.
* @see org.apache.solr.client.solrj.SolrRequest#getResponseParser()
*/
public void setParser(ResponseParser processor) {
_parser = processor;
}
public void setRequestWriter(RequestWriter requestWriter) {
this.requestWriter = requestWriter;
}
/**
* Adds the documents supplied by the given iterator.
*
* @param docIterator the iterator which returns SolrInputDocument instances
*
* @return the response from the SolrServer
*/
public UpdateResponse add(Iterator<SolrInputDocument> docIterator)
throws SolrServerException, IOException {
UpdateRequest req = new UpdateRequest();
req.setDocIterator(docIterator);
return req.process(this);
}
/**
* Adds the beans supplied by the given iterator.
*
* @param beanIterator the iterator which returns Beans
*
* @return the response from the SolrServer
*/
public UpdateResponse addBeans(final Iterator<?> beanIterator)
throws SolrServerException, IOException {
UpdateRequest req = new UpdateRequest();
req.setDocIterator(new Iterator<SolrInputDocument>() {
public boolean hasNext() {
return beanIterator.hasNext();
}
public SolrInputDocument next() {
Object o = beanIterator.next();
if (o == null) return null;
return getBinder().toSolrInputDocument(o);
}
public void remove() {
beanIterator.remove();
}
});
return req.process(this);
}
}

@ -0,0 +1,106 @@
package net.yacy.cora.services.federated.solr;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrInputDocument;
/**
 * Index schemes that translate a YaCy {@link Document} into a Solr input
 * document. The SolrCell scheme follows the field layout produced by the
 * Solr ExtractingRequestHandler, i.e. the same fields that Solr generates
 * when documents are imported with apache tika
 * (see http://wiki.apache.org/solr/ExtractingRequestHandler and the
 * example schema.xml shipped with Solr: id, sku, title, author,
 * description, content_type, subject, text, ...).
 */
public enum SolrScheme {

    SolrCell,
    DublinCore;

    /**
     * Translate a YaCy document into a Solr document using this scheme.
     * @param id the id of the target Solr document
     * @param document the parsed YaCy document
     * @return the translated document, or null if this scheme has no
     *         translation (currently only SolrCell is implemented)
     */
    public SolrInputDocument yacy2solr(final String id, final Document document) {
        switch (this) {
            case SolrCell:
                return yacy2solrSolrCell(id, document);
            default:
                // DublinCore (and any future scheme) has no translation yet
                return null;
        }
    }

    /**
     * Translate a YaCy document into the SolrCell field layout.
     * Only the metadata that the YaCy Document object exposes directly
     * (source url, title, creator, description, format, subject, text)
     * is mapped; further Document properties (anchors, images, rss feeds,
     * languages, geo coordinates, ...) are not transported yet.
     *
     * @param id the id of the target Solr document
     * @param yacydoc the parsed YaCy document
     * @return a SolrInputDocument holding the mapped fields
     */
    public static SolrInputDocument yacy2solrSolrCell(final String id, final Document yacydoc) {
        // we use the SolrCell design as index scheme
        final SolrInputDocument solrdoc = new SolrInputDocument();
        final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
        solrdoc.addField("id", id);
        // the url gets a boost of 3.0 because matches there are most significant
        solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
        solrdoc.addField("title", yacydoc.dc_title());
        solrdoc.addField("author", yacydoc.dc_creator());
        solrdoc.addField("description", yacydoc.dc_description());
        solrdoc.addField("content_type", yacydoc.dc_format());
        solrdoc.addField("subject", yacydoc.dc_subject(' '));
        solrdoc.addField("text", UTF8.String(yacydoc.getTextBytes()));
        return solrdoc;
    }
}

@ -0,0 +1,187 @@
package net.yacy.cora.services.federated.solr;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.document.Document;
/**
 * A connector to a single remote Solr server. Documents are translated
 * to the configured {@link SolrScheme} before they are pushed.
 */
public class SolrSingleConnector {

    // connection target and translation scheme; set once in the constructor
    private final String solrurl;
    private final SolrServer server;
    private final SolrScheme scheme;

    /**
     * Open a connector to a single Solr service.
     * @param url the base url of the Solr service, i.e. "http://localhost:8983/solr"
     * @param scheme the index scheme used to translate YaCy documents
     * @throws IOException if the given url is malformed
     */
    public SolrSingleConnector(final String url, final SolrScheme scheme) throws IOException {
        this.solrurl = url;
        this.scheme = scheme;
        try {
            this.server = new SolrHTTPClient(this.solrurl);
        } catch (MalformedURLException e) {
            // keep the cause so the caller can see why the url was rejected
            throw new IOException("bad connector url: " + this.solrurl, e);
        }
    }

    /**
     * delete everything in the solr index
     * @throws IOException
     */
    public void clear() throws IOException {
        try {
            this.server.deleteByQuery("*:*");
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * delete an entry from the index
     * @param id the id of the entry to be deleted
     * @throws IOException
     */
    public void delete(final String id) throws IOException {
        try {
            this.server.deleteById(id);
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * delete a set of entries from the index
     * @param ids a list of ids of entries to be deleted
     * @throws IOException
     */
    public void delete(final List<String> ids) throws IOException {
        try {
            this.server.deleteById(ids);
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * Push a raw file to Solr using the ExtractingRequestHandler
     * ("/update/extract"); Solr parses the content itself (via tika).
     * @param file the file to be parsed and indexed remotely
     * @param solrId the id of the target Solr document
     * @throws IOException
     */
    public void add(final File file, final String solrId) throws IOException {
        final ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
        up.addFile(file);
        up.setParam("literal.id", solrId);
        // unknown fields get the "attr_" prefix instead of causing errors
        up.setParam("uprefix", "attr_");
        up.setParam("fmap.content", "attr_content");
        try {
            this.server.request(up);
            this.server.commit();
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * Add a YaCy document using the scheme configured in the constructor.
     * @param id the id of the target Solr document
     * @param doc the parsed YaCy document
     * @throws IOException
     */
    public void add(final String id, final Document doc) throws IOException {
        add(id, doc, this.scheme);
    }

    /**
     * Add a YaCy document using an explicitly given scheme.
     * @param id the id of the target Solr document
     * @param doc the parsed YaCy document
     * @param tempScheme the scheme used for this single translation
     * @throws IOException
     */
    public void add(final String id, final Document doc, final SolrScheme tempScheme) throws IOException {
        addSolr(tempScheme.yacy2solr(id, doc));
    }

    /** add a single pre-translated Solr document */
    protected void addSolr(final SolrInputDocument doc) throws IOException {
        final Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
        docs.add(doc);
        addSolr(docs);
    }

    /** add a collection of pre-translated Solr documents and commit */
    protected void addSolr(final Collection<SolrInputDocument> docs) throws IOException {
        try {
            this.server.add(docs);
            this.server.commit();
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * get a query result from solr
     * to get all results set the query String to "*:*"
     * @param querystring the solr query string
     * @param offset the position of the first result
     * @param count the maximum number of results
     * @return the matching documents
     * @throws IOException
     */
    public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
        // construct query; no sort field is set here: the sort on "price"
        // from the Solr example application was removed because web documents
        // carry no such field and the remote Solr would reject the query
        final SolrQuery query = new SolrQuery();
        query.setQuery(querystring);
        query.setRows(count);
        query.setStart(offset);

        // query the server
        try {
            final QueryResponse rsp = this.server.query(query);
            return rsp.getResults();
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * Simple manual test: push all files from a local test directory to a
     * locally running Solr and print timing statistics.
     */
    public static void main(final String args[]) {
        try {
            final SolrSingleConnector solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", SolrScheme.SolrCell);
            solr.clear();
            final File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/");
            final String[] files = exampleDir.list();
            long t, t0, a = 0;
            int c = 0;
            if (files != null) for (final String s : files) {
                if (s.startsWith(".")) continue;
                t = System.currentTimeMillis();
                solr.add(new File(exampleDir, s), s);
                t0 = System.currentTimeMillis() - t;
                a += t0;
                c++;
                System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds");
            }
            // guard against division by zero; compute PPM with long arithmetic
            // (the previous integer expression 60000 / a * c was 0 for a > 60000)
            if (c > 0 && a > 0) {
                System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000L * c / a) + " PPM");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Loading…
Cancel
Save