commit
a48f37bf70
@ -1,620 +0,0 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file was part of the solrj package and used the apache http client 3.1
|
||||
* It was modified and adapted to work with the apache http client 4.1
|
||||
* using the net.yacy.cora connection package of YaCy
|
||||
* Code modifications (C) under Apache License 2.0 by Michael Christen, 14.4.2011
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services.federated.solr;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
|
||||
import org.apache.http.entity.mime.content.ContentBody;
|
||||
import org.apache.http.entity.mime.content.InputStreamBody;
|
||||
import org.apache.http.entity.mime.content.StringBody;
|
||||
import org.apache.solr.client.solrj.ResponseParser;
|
||||
import org.apache.solr.client.solrj.SolrRequest;
|
||||
import org.apache.solr.client.solrj.SolrServer;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.BinaryResponseParser;
|
||||
import org.apache.solr.client.solrj.request.RequestWriter;
|
||||
import org.apache.solr.client.solrj.request.UpdateRequest;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.apache.solr.client.solrj.util.ClientUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.ContentStream;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
|
||||
/**
|
||||
* The {@link SolrHTTPClient} uses YaCy's {@link net.yacy.cora.protocol.http.HTTPClient} (adapted for the Apache HTTP client 4.1) to connect to solr.
|
||||
* <pre class="prettyprint" >SolrServer server = new SolrHTTPClient( url );</pre>
|
||||
*
|
||||
* @version $Id: CommonsHttpSolrServer.java 1067552 2011-02-05 23:52:42Z koji $
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class SolrHTTPClient extends SolrServer {
|
||||
private static final long serialVersionUID = -4532572298724852268L;
|
||||
|
||||
/**
|
||||
* User-Agent String as identified by the HTTP request by the {@link
|
||||
* org.apache.commons.httpclient.HttpClient HttpClient} to the Solr
|
||||
* server from the client.
|
||||
*/
|
||||
public static final String AGENT = "Solr["+SolrHTTPClient.class.getName()+"] 1.0";
|
||||
|
||||
public final static Charset utf8;
|
||||
static {
|
||||
utf8 = Charset.forName("UTF-8");
|
||||
}
|
||||
|
||||
/**
|
||||
* The URL of the Solr server.
|
||||
*/
|
||||
protected String _baseURL, host, solraccount, solrpw;
|
||||
protected int port;
|
||||
|
||||
/**
|
||||
* Default value: null / empty. <p/>
|
||||
* Parameters that are added to every request regardless. This may be a place to add
|
||||
* something like an authentication token.
|
||||
*/
|
||||
protected ModifiableSolrParams _invariantParams;
|
||||
|
||||
/**
|
||||
* Default response parser is BinaryResponseParser <p/>
|
||||
* This parser represents the default Response Parser chosen to
|
||||
* parse the response if the parser were not specified as part of
|
||||
* the request.
|
||||
* @see org.apache.solr.client.solrj.impl.BinaryResponseParser
|
||||
*/
|
||||
protected ResponseParser _parser;
|
||||
|
||||
/**
|
||||
* The RequestWriter used to write all requests to Solr
|
||||
* @see org.apache.solr.client.solrj.request.RequestWriter
|
||||
*/
|
||||
protected RequestWriter requestWriter = new RequestWriter();
|
||||
|
||||
/**
 * Creates a client for the Solr server at the given address; the string is
 * converted to a {@link URL} and handed to the URL-based constructor.
 *
 * @param solrServerUrl the URL of the Solr server, for example
 *        "<code>http://localhost:8983/solr/</code>" if you are using the
 *        standard distribution Solr webapp on your local machine
 * @throws MalformedURLException if <code>solrServerUrl</code> is not a valid URL
 */
public SolrHTTPClient(final String solrServerUrl) throws MalformedURLException {
    this(new URL(solrServerUrl));
}
|
||||
|
||||
/**
 * Creates a client for the Solr server at the given URL that parses
 * responses with a new {@link BinaryResponseParser}.
 *
 * @param baseURL the URL of the Solr server, for example
 *        "<code>http://localhost:8983/solr/</code>" if you are using the
 *        standard distribution Solr webapp on your local machine
 */
public SolrHTTPClient(final URL baseURL) {
    this(baseURL, new BinaryResponseParser());
}
|
||||
|
||||
/**
 * Creates a client for the Solr server at the given URL.
 * <p>
 * A trailing slash is removed from the base URL. Host, port and the
 * credentials from the user-info part ("account:password") are extracted
 * from it; if the URL cannot be parsed as a {@link MultiProtocolURI}, host
 * and credentials are left empty and the port is set to -1.
 *
 * @param baseURL the URL of the Solr server; must not contain a query part
 * @param parser the default parser for responses
 * @throws RuntimeException if the base URL contains a '?'
 * @see #_parser
 */
public SolrHTTPClient(final URL baseURL, final ResponseParser parser) {
    final String external = baseURL.toExternalForm();
    this._baseURL = external.endsWith("/") ? external.substring(0, external.length() - 1) : external;
    if (this._baseURL.indexOf('?') >= 0) {
        throw new RuntimeException("Invalid base url for solrj. The base URL must not contain parameters: " + this._baseURL);
    }

    // collect the connection details in locals first, assign the fields once at the end
    String hostName;
    int portNumber;
    String account;
    String password;
    try {
        final MultiProtocolURI uri = new MultiProtocolURI(this._baseURL);
        hostName = uri.getHost();
        portNumber = uri.getPort();
        final String userinfo = uri.getUserInfo();
        final int colon = (userinfo == null) ? -1 : userinfo.indexOf(':');
        if (userinfo == null || userinfo.length() == 0) {
            // no credentials given
            account = "";
            password = "";
        } else if (colon < 0) {
            // account name without a password
            account = userinfo;
            password = "";
        } else {
            account = userinfo.substring(0, colon);
            password = userinfo.substring(colon + 1);
        }
    } catch (final MalformedURLException e) {
        // unparseable URL: fall back to "no host, no credentials"
        hostName = "";
        portNumber = -1;
        account = "";
        password = "";
    }
    this.host = hostName;
    this.port = portNumber;
    this.solraccount = account;
    this.solrpw = password;

    this._parser = parser;
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Process the request. If {@link org.apache.solr.client.solrj.SolrRequest#getResponseParser()} is null, then use
|
||||
* {@link #getParser()}
|
||||
* @param request The {@link org.apache.solr.client.solrj.SolrRequest} to process
|
||||
* @return The {@link org.apache.solr.common.util.NamedList} result
|
||||
* @throws SolrServerException
|
||||
* @throws IOException
|
||||
*
|
||||
* @see #request(org.apache.solr.client.solrj.SolrRequest, org.apache.solr.client.solrj.ResponseParser)
|
||||
*/
|
||||
@Override
|
||||
public NamedList<Object> request( final SolrRequest request ) throws SolrServerException, IOException
|
||||
{
|
||||
ResponseParser responseParser = request.getResponseParser();
|
||||
if (responseParser == null) {
|
||||
responseParser = this._parser;
|
||||
}
|
||||
return request(request, responseParser);
|
||||
}
|
||||
|
||||
|
||||
public NamedList<Object> request(final SolrRequest request, final ResponseParser processor) throws SolrServerException, IOException {
|
||||
SolrParams params = request.getParams();
|
||||
final Collection<ContentStream> streams = this.requestWriter.getContentStreams(request);
|
||||
String path = this.requestWriter.getPath(request);
|
||||
if( path == null || !path.startsWith( "/" ) ) {
|
||||
path = "/select";
|
||||
}
|
||||
|
||||
// The parser 'wt=' and 'version=' params are used instead of the original params
|
||||
ResponseParser parser = request.getResponseParser();
|
||||
if( parser == null ) {
|
||||
parser = this._parser;
|
||||
}
|
||||
final ModifiableSolrParams wparams = new ModifiableSolrParams();
|
||||
wparams.set( CommonParams.WT, parser.getWriterType() );
|
||||
wparams.set( CommonParams.VERSION, parser.getVersion());
|
||||
if( params == null ) {
|
||||
params = wparams;
|
||||
}
|
||||
else {
|
||||
params = SolrParams.wrapDefaults(wparams, params);
|
||||
}
|
||||
|
||||
if( this._invariantParams != null ) {
|
||||
params = SolrParams.wrapDefaults( this._invariantParams, params );
|
||||
}
|
||||
|
||||
|
||||
byte[] result = null;
|
||||
final HTTPClient client = new HTTPClient();
|
||||
if (this.solraccount.length() > 0 && this.solrpw.length() > 0 && this.host.length() > 0) {
|
||||
HTTPClient.setAuth(this.host, this.port, this.solraccount, this.solrpw);
|
||||
}
|
||||
|
||||
if (SolrRequest.METHOD.POST == request.getMethod()) {
|
||||
final boolean isMultipart = ( streams != null && streams.size() > 1 );
|
||||
if (streams == null || isMultipart) {
|
||||
String url = this._baseURL + path;
|
||||
|
||||
final HashMap<String, ContentBody> parts = new HashMap<String, ContentBody>();
|
||||
final Iterator<String> iter = params.getParameterNamesIterator();
|
||||
while (iter.hasNext()) {
|
||||
final String p = iter.next();
|
||||
final String[] vals = params.getParams(p);
|
||||
if (vals != null) {
|
||||
for (final String v : vals) {
|
||||
if (isMultipart) {
|
||||
parts.put(p, new StringBody(v, utf8));
|
||||
} else {
|
||||
if (url.indexOf('?') >= 0) url += "&" + p + "=" + v; else url += "?" + p + "=" + v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (isMultipart) {
|
||||
for (final ContentStream content : streams) {
|
||||
parts.put(content.getName(), new InputStreamBody(content.getStream(), content.getContentType(), null));
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
result = client.POSTbytes(url, parts, true);
|
||||
} finally {
|
||||
client.finish();
|
||||
}
|
||||
} else {
|
||||
// It has one stream, this is the post body, put the params in the URL
|
||||
final String pstr = ClientUtils.toQueryString(params, false);
|
||||
final String url = this._baseURL + path + pstr;
|
||||
|
||||
// Single stream as body
|
||||
// Using a loop just to get the first one
|
||||
final ContentStream[] contentStream = new ContentStream[1];
|
||||
for (final ContentStream content : streams) {
|
||||
contentStream[0] = content;
|
||||
break;
|
||||
}
|
||||
result = client.POSTbytes(url, contentStream[0].getStream(), contentStream[0].getStream().available());
|
||||
}
|
||||
} else if (SolrRequest.METHOD.GET == request.getMethod()) {
|
||||
result = client.GETbytes( this._baseURL + path + ClientUtils.toQueryString( params, false ));
|
||||
} else {
|
||||
throw new SolrServerException("Unsupported method: "+request.getMethod() );
|
||||
}
|
||||
|
||||
final int statusCode = client.getStatusCode();
|
||||
if (statusCode != 200) {
|
||||
throw new IOException("bad status code: " + statusCode + ", " + client.getHttpResponse().getStatusLine() + ", url = " + this._baseURL + path);
|
||||
}
|
||||
|
||||
// Read the contents
|
||||
//System.out.println("SOLR RESPONSE: " + UTF8.String(result));
|
||||
final InputStream respBody = new ByteArrayInputStream(result);
|
||||
return processor.processResponse(respBody, "UTF-8");
|
||||
}
|
||||
|
||||
/*
|
||||
* The original code for the request method
|
||||
public NamedList<Object> request(final SolrRequest request, ResponseParser processor) throws SolrServerException, IOException {
|
||||
HttpMethod method = null;
|
||||
InputStream is = null;
|
||||
SolrParams params = request.getParams();
|
||||
Collection<ContentStream> streams = requestWriter.getContentStreams(request);
|
||||
String path = requestWriter.getPath(request);
|
||||
if( path == null || !path.startsWith( "/" ) ) {
|
||||
path = "/select";
|
||||
}
|
||||
|
||||
ResponseParser parser = request.getResponseParser();
|
||||
if( parser == null ) {
|
||||
parser = _parser;
|
||||
}
|
||||
|
||||
// The parser 'wt=' and 'version=' params are used instead of the original params
|
||||
ModifiableSolrParams wparams = new ModifiableSolrParams();
|
||||
wparams.set( CommonParams.WT, parser.getWriterType() );
|
||||
wparams.set( CommonParams.VERSION, parser.getVersion());
|
||||
if( params == null ) {
|
||||
params = wparams;
|
||||
}
|
||||
else {
|
||||
params = new DefaultSolrParams( wparams, params );
|
||||
}
|
||||
|
||||
if( _invariantParams != null ) {
|
||||
params = new DefaultSolrParams( _invariantParams, params );
|
||||
}
|
||||
|
||||
int tries = _maxRetries + 1;
|
||||
try {
|
||||
while( tries-- > 0 ) {
|
||||
// Note: since we aren't do intermittent time keeping
|
||||
// ourselves, the potential non-timeout latency could be as
|
||||
// much as tries-times (plus scheduling effects) the given
|
||||
// timeAllowed.
|
||||
try {
|
||||
if( SolrRequest.METHOD.GET == request.getMethod() ) {
|
||||
if( streams != null ) {
|
||||
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "GET can't send streams!" );
|
||||
}
|
||||
method = new GetMethod( _baseURL + path + ClientUtils.toQueryString( params, false ) );
|
||||
}
|
||||
else if( SolrRequest.METHOD.POST == request.getMethod() ) {
|
||||
|
||||
String url = _baseURL + path;
|
||||
boolean isMultipart = ( streams != null && streams.size() > 1 );
|
||||
|
||||
if (streams == null || isMultipart) {
|
||||
PostMethod post = new PostMethod(url);
|
||||
post.getParams().setContentCharset("UTF-8");
|
||||
if (!this.useMultiPartPost && !isMultipart) {
|
||||
post.addRequestHeader("Content-Type",
|
||||
"application/x-www-form-urlencoded; charset=UTF-8");
|
||||
}
|
||||
|
||||
List<Part> parts = new LinkedList<Part>();
|
||||
Iterator<String> iter = params.getParameterNamesIterator();
|
||||
while (iter.hasNext()) {
|
||||
String p = iter.next();
|
||||
String[] vals = params.getParams(p);
|
||||
if (vals != null) {
|
||||
for (String v : vals) {
|
||||
if (this.useMultiPartPost || isMultipart) {
|
||||
parts.add(new StringPart(p, v, "UTF-8"));
|
||||
} else {
|
||||
post.addParameter(p, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (isMultipart) {
|
||||
int i = 0;
|
||||
for (ContentStream content : streams) {
|
||||
final ContentStream c = content;
|
||||
|
||||
String charSet = null;
|
||||
String transferEncoding = null;
|
||||
parts.add(new PartBase(c.getName(), c.getContentType(),
|
||||
charSet, transferEncoding) {
|
||||
@Override
|
||||
protected long lengthOfData() throws IOException {
|
||||
return c.getSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void sendData(OutputStream out)
|
||||
throws IOException {
|
||||
InputStream in = c.getStream();
|
||||
try {
|
||||
IOUtils.copy(in, out);
|
||||
} finally {
|
||||
in.close();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
if (parts.size() > 0) {
|
||||
post.setRequestEntity(new MultipartRequestEntity(parts
|
||||
.toArray(new Part[parts.size()]), post.getParams()));
|
||||
}
|
||||
|
||||
method = post;
|
||||
}
|
||||
// It is has one stream, it is the post body, put the params in the URL
|
||||
else {
|
||||
String pstr = ClientUtils.toQueryString(params, false);
|
||||
PostMethod post = new PostMethod(url + pstr);
|
||||
|
||||
// Single stream as body
|
||||
// Using a loop just to get the first one
|
||||
final ContentStream[] contentStream = new ContentStream[1];
|
||||
for (ContentStream content : streams) {
|
||||
contentStream[0] = content;
|
||||
break;
|
||||
}
|
||||
if (contentStream[0] instanceof RequestWriter.LazyContentStream) {
|
||||
post.setRequestEntity(new RequestEntity() {
|
||||
public long getContentLength() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
return contentStream[0].getContentType();
|
||||
}
|
||||
|
||||
public boolean isRepeatable() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void writeRequest(OutputStream outputStream) throws IOException {
|
||||
((RequestWriter.LazyContentStream) contentStream[0]).writeTo(outputStream);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
} else {
|
||||
is = contentStream[0].getStream();
|
||||
post.setRequestEntity(new InputStreamRequestEntity(is, contentStream[0].getContentType()));
|
||||
}
|
||||
method = post;
|
||||
}
|
||||
}
|
||||
else {
|
||||
throw new SolrServerException("Unsupported method: "+request.getMethod() );
|
||||
}
|
||||
}
|
||||
catch( NoHttpResponseException r ) {
|
||||
// This is generally safe to retry on
|
||||
method.releaseConnection();
|
||||
method = null;
|
||||
if(is != null) {
|
||||
is.close();
|
||||
}
|
||||
// If out of tries then just rethrow (as normal error).
|
||||
if( ( tries < 1 ) ) {
|
||||
throw r;
|
||||
}
|
||||
//log.warn( "Caught: " + r + ". Retrying..." );
|
||||
}
|
||||
}
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new SolrServerException("error reading streams", ex );
|
||||
}
|
||||
|
||||
method.setFollowRedirects( _followRedirects );
|
||||
method.addRequestHeader( "User-Agent", AGENT );
|
||||
if( _allowCompression ) {
|
||||
method.setRequestHeader( new Header( "Accept-Encoding", "gzip,deflate" ) );
|
||||
}
|
||||
|
||||
try {
|
||||
// Execute the method.
|
||||
//System.out.println( "EXECUTE:"+method.getURI() );
|
||||
|
||||
int statusCode = _httpClient.executeMethod(method);
|
||||
if (statusCode != HttpStatus.SC_OK) {
|
||||
StringBuilder msg = new StringBuilder();
|
||||
msg.append( method.getStatusLine().getReasonPhrase() );
|
||||
msg.append( "\n\n" );
|
||||
msg.append( method.getStatusText() );
|
||||
msg.append( "\n\n" );
|
||||
msg.append( "request: "+method.getURI() );
|
||||
throw new SolrException(statusCode, java.net.URLDecoder.decode(msg.toString(), "UTF-8") );
|
||||
}
|
||||
|
||||
// Read the contents
|
||||
String charset = "UTF-8";
|
||||
if( method instanceof HttpMethodBase ) {
|
||||
charset = ((HttpMethodBase)method).getResponseCharSet();
|
||||
}
|
||||
InputStream respBody = method.getResponseBodyAsStream();
|
||||
// Jakarta Commons HTTPClient doesn't handle any
|
||||
// compression natively. Handle gzip or deflate
|
||||
// here if applicable.
|
||||
if( _allowCompression ) {
|
||||
Header contentEncodingHeader = method.getResponseHeader( "Content-Encoding" );
|
||||
if( contentEncodingHeader != null ) {
|
||||
String contentEncoding = contentEncodingHeader.getValue();
|
||||
if( contentEncoding.contains( "gzip" ) ) {
|
||||
//log.debug( "wrapping response in GZIPInputStream" );
|
||||
respBody = new GZIPInputStream( respBody );
|
||||
}
|
||||
else if( contentEncoding.contains( "deflate" ) ) {
|
||||
//log.debug( "wrapping response in InflaterInputStream" );
|
||||
respBody = new InflaterInputStream(respBody);
|
||||
}
|
||||
}
|
||||
else {
|
||||
Header contentTypeHeader = method.getResponseHeader( "Content-Type" );
|
||||
if( contentTypeHeader != null ) {
|
||||
String contentType = contentTypeHeader.getValue();
|
||||
if( contentType != null ) {
|
||||
if( contentType.startsWith( "application/x-gzip-compressed" ) ) {
|
||||
//log.debug( "wrapping response in GZIPInputStream" );
|
||||
respBody = new GZIPInputStream( respBody );
|
||||
}
|
||||
else if ( contentType.startsWith("application/x-deflate") ) {
|
||||
//log.debug( "wrapping response in InflaterInputStream" );
|
||||
respBody = new InflaterInputStream(respBody);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return processor.processResponse(respBody, charset);
|
||||
}
|
||||
catch (HttpException e) {
|
||||
throw new SolrServerException( e );
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new SolrServerException( e );
|
||||
}
|
||||
finally {
|
||||
method.releaseConnection();
|
||||
if(is != null) {
|
||||
is.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Retrieve the default list of parameters are added to every request regardless.
|
||||
*
|
||||
* @see #_invariantParams
|
||||
*/
|
||||
public ModifiableSolrParams getInvariantParams()
|
||||
{
|
||||
return this._invariantParams;
|
||||
}
|
||||
|
||||
public String getBaseURL() {
|
||||
return this._baseURL;
|
||||
}
|
||||
|
||||
public void setBaseURL(final String baseURL) {
|
||||
this._baseURL = baseURL;
|
||||
}
|
||||
|
||||
public ResponseParser getParser() {
|
||||
return this._parser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: This setter method is <b>not thread-safe</b>.
|
||||
* @param processor Default Response Parser chosen to parse the response if the parser were not specified as part of the request.
|
||||
* @see org.apache.solr.client.solrj.SolrRequest#getResponseParser()
|
||||
*/
|
||||
public void setParser(final ResponseParser processor) {
|
||||
this._parser = processor;
|
||||
}
|
||||
|
||||
|
||||
public void setRequestWriter(final RequestWriter requestWriter) {
|
||||
this.requestWriter = requestWriter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the documents supplied by the given iterator.
|
||||
*
|
||||
* @param docIterator the iterator which returns SolrInputDocument instances
|
||||
*
|
||||
* @return the response from the SolrServer
|
||||
*/
|
||||
public UpdateResponse add(final Iterator<SolrInputDocument> docIterator)
|
||||
throws SolrServerException, IOException {
|
||||
final UpdateRequest req = new UpdateRequest();
|
||||
req.setDocIterator(docIterator);
|
||||
return req.process(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the beans supplied by the given iterator.
|
||||
*
|
||||
* @param beanIterator the iterator which returns Beans
|
||||
*
|
||||
* @return the response from the SolrServer
|
||||
*/
|
||||
public UpdateResponse addBeans(final Iterator<?> beanIterator)
|
||||
throws SolrServerException, IOException {
|
||||
final UpdateRequest req = new UpdateRequest();
|
||||
req.setDocIterator(new Iterator<SolrInputDocument>() {
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return beanIterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SolrInputDocument next() {
|
||||
final Object o = beanIterator.next();
|
||||
if (o == null) return null;
|
||||
return getBinder().toSolrInputDocument(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
beanIterator.remove();
|
||||
}
|
||||
});
|
||||
return req.process(this);
|
||||
}
|
||||
}
|
@ -0,0 +1,91 @@
|
||||
package net.yacy.cora.storage;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.AbstractMap;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
/**
 * Read-only {@link Map} view on the entries of an existing ZIP file, keyed
 * by entry name.
 * <p>
 * The entry names are read once in the constructor; the entries themselves
 * are looked up in the open {@link ZipFile} on demand. The reader keeps the
 * file open until {@link #close()} is called.
 */
public class ZIPReader extends AbstractMap<String, ZipEntry> implements Map<String, ZipEntry>, Iterable<Map.Entry<String, ZipEntry>>, java.io.Closeable {

    /** names of all entries, collected at construction time; unmodifiable */
    private final Set<String> filenames;

    /** the opened archive */
    private final ZipFile zipFile;

    /**
     * Opens the given ZIP file and reads the names of all its entries.
     *
     * @param file an existing ZIP file
     * @throws IOException if the file does not exist or cannot be opened as ZIP file
     */
    public ZIPReader(File file) throws IOException {
        super();
        if (!file.exists()) throw new IOException("ZIPReader can only be used for existing files");
        this.zipFile = new ZipFile(file);

        // read all entries
        final Set<String> names = new HashSet<String>();
        final Enumeration<? extends ZipEntry> e = this.zipFile.entries();
        while (e.hasMoreElements()) {
            names.add(e.nextElement().getName());
        }
        // callers get this set from keySet(); it must not be possible to change it
        this.filenames = java.util.Collections.unmodifiableSet(names);
    }

    /**
     * Iterates over all entries of the archive as (name, entry) pairs.
     * Removal is not supported.
     */
    @Override
    public Iterator<Map.Entry<String, ZipEntry>> iterator() {
        final Enumeration<? extends ZipEntry> e = this.zipFile.entries();
        return new Iterator<Map.Entry<String, ZipEntry>>() {

            @Override
            public boolean hasNext() {
                return e.hasMoreElements();
            }

            @Override
            public Map.Entry<String, ZipEntry> next() {
                final ZipEntry z = e.nextElement();
                return new AbstractMap.SimpleImmutableEntry<String, ZipEntry>(z.getName(), z);
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /** @return the number of entries in the archive */
    @Override
    public int size() {
        return this.zipFile.size();
    }

    @Override
    public boolean isEmpty() {
        return this.zipFile.size() == 0;
    }

    /** @return true if the archive has an entry with exactly this name */
    @Override
    public boolean containsKey(Object key) {
        return this.filenames.contains(key);
    }

    /**
     * Looks up an entry by name in the archive.
     *
     * @param key the entry name
     * @return the entry, or null if there is none or the key is not a String
     */
    @Override
    public ZipEntry get(Object key) {
        // behave like containsKey for foreign key types instead of throwing
        if (!(key instanceof String)) return null;
        return this.zipFile.getEntry((String) key);
    }

    /** @return the unmodifiable set of entry names read at construction time */
    @Override
    public Set<String> keySet() {
        return this.filenames;
    }

    /**
     * Read-only view on all (name, entry) pairs, backed by {@link #iterator()}.
     * The inherited methods of {@link AbstractMap} (values(), toString(),
     * equals(), hashCode(), ...) are built on this view.
     */
    @Override
    public Set<Map.Entry<String, ZipEntry>> entrySet() {
        return new java.util.AbstractSet<Map.Entry<String, ZipEntry>>() {

            @Override
            public Iterator<Map.Entry<String, ZipEntry>> iterator() {
                return ZIPReader.this.iterator();
            }

            @Override
            public int size() {
                return ZIPReader.this.size();
            }
        };
    }

    /**
     * Closes the underlying ZIP file.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        this.zipFile.close();
    }

}
|
@ -0,0 +1,62 @@
|
||||
package net.yacy.cora.storage;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.AbstractMap;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
/**
 * Write-once {@link Map} front end for creating a new ZIP file: every
 * {@link #put(String, ZipEntry)} starts a new entry in the archive and
 * records it, so that the entries written so far can be looked up again.
 * <p>
 * Entries cannot be removed once they are written; all views are read-only.
 * The archive is finished by {@link #close()}.
 */
public class ZIPWriter extends AbstractMap<String, ZipEntry> implements Map<String, ZipEntry>, Iterable<Map.Entry<String, ZipEntry>>, java.io.Closeable {

    /** all entries that were successfully started in the archive */
    private final HashMap<String, ZipEntry> backup;

    /** the archive that is being written */
    private final ZipOutputStream zos;

    /**
     * Creates a new ZIP file.
     *
     * @param file the file to create; must not exist yet
     * @throws IOException if the file already exists or cannot be created
     */
    public ZIPWriter(File file) throws IOException {
        super();
        if (file.exists()) throw new IOException("ZIPWriter can only be used for new files");
        this.backup = new HashMap<String, ZipEntry>();
        this.zos = new ZipOutputStream(new FileOutputStream(file));
    }

    /**
     * Starts a new entry in the archive and records it under the given key.
     *
     * @param key the name under which the entry is recorded
     * @param value the entry to start
     * @return always null, because a key can be written only once
     * @throws IllegalArgumentException if the key was already written
     * @throws IllegalStateException if the entry cannot be written to the
     *         archive; the cause is the underlying {@link IOException}
     */
    @Override
    public ZipEntry put(String key, ZipEntry value) {
        if (this.backup.containsKey(key)) {
            throw new IllegalArgumentException("zip entry '" + key + "' was already written");
        }
        try {
            this.zos.putNextEntry(value);
        } catch (IOException e) {
            // a silently dropped entry would leave an incomplete archive behind
            throw new IllegalStateException("cannot write zip entry '" + key + "'", e);
        }
        this.backup.put(key, value);
        return null;
    }

    /** @return the entry recorded under the key, or null if there is none */
    @Override
    public ZipEntry get(Object key) {
        return this.backup.get(key);
    }

    @Override
    public boolean containsKey(Object key) {
        return this.backup.containsKey(key);
    }

    /** @return the number of entries written so far */
    @Override
    public int size() {
        return this.backup.size();
    }

    /** Iterates over the entries written so far; removal is not supported. */
    @Override
    public Iterator<Map.Entry<String, ZipEntry>> iterator() {
        return entrySet().iterator();
    }

    /** Not supported: written entries cannot be taken back. */
    @Override
    public void clear() {
        throw new UnsupportedOperationException();
    }

    /** @return a read-only view on the entries written so far */
    @Override
    public Set<Map.Entry<String, ZipEntry>> entrySet() {
        // consistent with clear(): removing an entry here would not remove it from the archive
        return java.util.Collections.unmodifiableMap(this.backup).entrySet();
    }

    /**
     * Finishes and closes the archive.
     *
     * @throws IOException if writing the archive fails
     */
    @Override
    public void close() throws IOException {
        this.zos.close();
    }

}
|
@ -0,0 +1,390 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* this parser was copied and modified to fit into YaCy from the apache tika project
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.document.parser;
|
||||
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.kelondro.util.MemoryControl;
|
||||
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
|
||||
public class dwgParser extends AbstractParser implements Parser {
|
||||
|
||||
|
||||
/**
 * Text of the marker that is searched for in the file header.
 * NOTE(review): the name suggests it introduces the property section of the
 * 2000 format (AC1015); confirm against the apache tika original.
 */
private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS COOKIE";

/** The marker text as bytes, one byte per character; filled by the static initializer. */
private static final byte[] HEADER_2000_PROPERTIES_MARKER = new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];

static {
    // write the marker text into the byte array, starting at offset 0
    StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR, HEADER_2000_PROPERTIES_MARKER, 0);
}

/**
 * How far to skip after the last standard property, before
 * we find any custom properties that might be there.
 */
private static final int CUSTOM_PROPERTIES_SKIP = 20;
|
||||
|
||||
/**
 * Creates the parser and registers the file extension and the mime types
 * it is responsible for.
 */
public dwgParser() {
    super("DWG (CAD Drawing) parser (very basic)");
    SUPPORTED_EXTENSIONS.add("dwg");
    SUPPORTED_MIME_TYPES.add("application/dwg");
    // NOTE(review): "applications/..." (plural) looks like a typo for
    // "application/vnd.dwg"; kept as is until the intended mime type is confirmed
    SUPPORTED_MIME_TYPES.add("applications/vnd.dwg");
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
|
||||
|
||||
// check memory for parser
|
||||
if (!MemoryControl.request(200 * 1024 * 1024, true))
|
||||
throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
|
||||
return null;
|
||||
// First up, which version of the format are we handling?
|
||||
/*
|
||||
byte[] header = new byte[128];
|
||||
IOUtils.readFully(source, header);
|
||||
String version = new String(header, 0, 6, "US-ASCII");
|
||||
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
|
||||
xhtml.startDocument();
|
||||
|
||||
if (version.equals("AC1015")) {
|
||||
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
|
||||
if (skipTo2000PropertyInfoSection(stream, header)) {
|
||||
get2000Props(stream,metadata,xhtml);
|
||||
}
|
||||
} else if (version.equals("AC1018")) {
|
||||
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
|
||||
if (skipToPropertyInfoSection(stream, header)) {
|
||||
get2004Props(stream,metadata,xhtml);
|
||||
}
|
||||
} else if (version.equals("AC1021") || version.equals("AC1024")) {
|
||||
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
|
||||
if (skipToPropertyInfoSection(stream, header)) {
|
||||
get2007and2010Props(stream,metadata,xhtml);
|
||||
}
|
||||
} else {
|
||||
throw new TikaException(
|
||||
"Unsupported AutoCAD drawing version: " + version);
|
||||
}
|
||||
|
||||
xhtml.endDocument();
|
||||
|
||||
|
||||
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
|
||||
if (info != null) {
|
||||
docTitle = info.getTitle();
|
||||
docSubject = info.getSubject();
|
||||
docAuthor = info.getAuthor();
|
||||
docPublisher = info.getProducer();
|
||||
if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
|
||||
docKeywordStr = info.getKeywords();
|
||||
}
|
||||
|
||||
if (docTitle == null || docTitle.length() == 0) {
|
||||
docTitle = MultiProtocolURI.unescape(location.getFileName());
|
||||
}
|
||||
|
||||
String[] docKeywords = null;
|
||||
if (docKeywordStr != null) {
|
||||
docKeywords = docKeywordStr.split(" |,");
|
||||
}
|
||||
if (docTitle == null) {
|
||||
docTitle = docSubject;
|
||||
}
|
||||
|
||||
byte[] contentBytes;
|
||||
|
||||
return new Document[]{new Document(
|
||||
location,
|
||||
mimeType,
|
||||
"UTF-8",
|
||||
this,
|
||||
null,
|
||||
docKeywords,
|
||||
docTitle,
|
||||
docAuthor,
|
||||
docPublisher,
|
||||
null,
|
||||
null,
|
||||
0.0f, 0.0f,
|
||||
contentBytes,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
false)};
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
private void get2004Props(
|
||||
InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
|
||||
throws IOException, TikaException, SAXException {
|
||||
// Standard properties
|
||||
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
|
||||
String headerValue = read2004String(stream);
|
||||
handleHeader(i, headerValue, metadata, xhtml);
|
||||
}
|
||||
|
||||
// Custom properties
|
||||
int customCount = skipToCustomProperties(stream);
|
||||
for (int i = 0; i < customCount; i++) {
|
||||
String propName = read2004String(stream);
|
||||
String propValue = read2004String(stream);
|
||||
if(propName.length() > 0 && propValue.length() > 0) {
|
||||
metadata.add(propName, propValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String read2004String(InputStream stream) throws IOException, TikaException {
|
||||
int stringLen = EndianUtils.readUShortLE(stream);
|
||||
|
||||
byte[] stringData = new byte[stringLen];
|
||||
IOUtils.readFully(stream, stringData);
|
||||
|
||||
// Often but not always null terminated
|
||||
if (stringData[stringLen-1] == 0) {
|
||||
stringLen--;
|
||||
}
|
||||
String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
|
||||
return value;
|
||||
}
|
||||
|
||||
// Stored as UCS2, so 16 bit "unicode"
|
||||
private void get2007and2010Props(
|
||||
InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
|
||||
throws IOException, TikaException, SAXException {
|
||||
// Standard properties
|
||||
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
|
||||
String headerValue = read2007and2010String(stream);
|
||||
handleHeader(i, headerValue, metadata, xhtml);
|
||||
}
|
||||
|
||||
// Custom properties
|
||||
int customCount = skipToCustomProperties(stream);
|
||||
for (int i = 0; i < customCount; i++) {
|
||||
String propName = read2007and2010String(stream);
|
||||
String propValue = read2007and2010String(stream);
|
||||
if(propName.length() > 0 && propValue.length() > 0) {
|
||||
metadata.add(propName, propValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String read2007and2010String(InputStream stream) throws IOException, TikaException {
|
||||
int stringLen = EndianUtils.readUShortLE(stream);
|
||||
|
||||
byte[] stringData = new byte[stringLen * 2];
|
||||
IOUtils.readFully(stream, stringData);
|
||||
String value = StringUtil.getFromUnicodeLE(stringData);
|
||||
|
||||
// Some strings are null terminated
|
||||
if(value.charAt(value.length()-1) == 0) {
|
||||
value = value.substring(0, value.length()-1);
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
private void get2000Props(
|
||||
InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
|
||||
throws IOException, TikaException, SAXException {
|
||||
int propCount = 0;
|
||||
while(propCount < 30) {
|
||||
int propIdx = EndianUtils.readUShortLE(stream);
|
||||
int length = EndianUtils.readUShortLE(stream);
|
||||
int valueType = stream.read();
|
||||
|
||||
if(propIdx == 0x28) {
|
||||
// This one seems not to follow the pattern
|
||||
length = 0x19;
|
||||
} else if(propIdx == 90) {
|
||||
// We think this means the end of properties
|
||||
break;
|
||||
}
|
||||
|
||||
byte[] value = new byte[length];
|
||||
IOUtils.readFully(stream, value);
|
||||
if(valueType == 0x1e) {
|
||||
// Normal string, good
|
||||
String val = StringUtil.getFromCompressedUnicode(value, 0, length);
|
||||
|
||||
// Is it one we can look up by index?
|
||||
if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
|
||||
metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
|
||||
xhtml.element("p", val);
|
||||
} else if(propIdx == 0x012c) {
|
||||
int splitAt = val.indexOf('=');
|
||||
if(splitAt > -1) {
|
||||
String propName = val.substring(0, splitAt);
|
||||
String propVal = val.substring(splitAt+1);
|
||||
metadata.add(propName, propVal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No idea...
|
||||
}
|
||||
|
||||
propCount++;
|
||||
}
|
||||
}
|
||||
|
||||
private void handleHeader(
|
||||
int headerNumber, String value, Metadata metadata,
|
||||
XHTMLContentHandler xhtml) throws SAXException {
|
||||
if(value == null || value.length() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
String headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
|
||||
if(headerProp != null) {
|
||||
metadata.set(headerProp, value);
|
||||
}
|
||||
|
||||
xhtml.element("p", value);
|
||||
}
|
||||
|
||||
// Grab the offset, then skip there
|
||||
private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
|
||||
throws IOException, TikaException {
|
||||
// The offset is stored in the header from 0x20 onwards
|
||||
long offsetToSection = EndianUtils.getLongLE(header, 0x20);
|
||||
long toSkip = offsetToSection - header.length;
|
||||
if(offsetToSection == 0){
|
||||
return false;
|
||||
}
|
||||
while (toSkip > 0) {
|
||||
byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
|
||||
IOUtils.readFully(stream, skip);
|
||||
toSkip -= skip.length;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//We think it can be anywhere...
|
||||
private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
|
||||
throws IOException {
|
||||
int val = 0;
|
||||
while(val != -1) {
|
||||
val = stream.read();
|
||||
if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
|
||||
boolean going = true;
|
||||
for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
|
||||
val = stream.read();
|
||||
if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
|
||||
}
|
||||
if(going) {
|
||||
// Bingo, found it
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private int skipToCustomProperties(InputStream stream)
|
||||
throws IOException, TikaException {
|
||||
// There should be 4 zero bytes next
|
||||
byte[] padding = new byte[4];
|
||||
IOUtils.readFully(stream, padding);
|
||||
if(padding[0] == 0 && padding[1] == 0 &&
|
||||
padding[2] == 0 && padding[3] == 0) {
|
||||
// Looks hopeful, skip on
|
||||
padding = new byte[CUSTOM_PROPERTIES_SKIP];
|
||||
IOUtils.readFully(stream, padding);
|
||||
|
||||
// We should now have the count
|
||||
int count = EndianUtils.readUShortLE(stream);
|
||||
|
||||
// Sanity check it
|
||||
if(count > 0 && count < 0x7f) {
|
||||
// Looks plausible
|
||||
return count;
|
||||
} else {
|
||||
// No properties / count is too high to trust
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
// No padding. That probably means no custom props
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(final String[] args) {
|
||||
if (args.length > 0 && args[0].length() > 0) {
|
||||
// file
|
||||
final File dwgFile = new File(args[0]);
|
||||
if(dwgFile.canRead()) {
|
||||
|
||||
System.out.println(dwgFile.getAbsolutePath());
|
||||
final long startTime = System.currentTimeMillis();
|
||||
|
||||
// parse
|
||||
final AbstractParser parser = new dwgParser();
|
||||
Document document = null;
|
||||
try {
|
||||
document = Document.mergeDocuments(null, "application/dwg", parser.parse(null, "application/dwg", null, new FileInputStream(dwgFile)));
|
||||
} catch (final Parser.Failure e) {
|
||||
System.err.println("Cannot parse file " + dwgFile.getAbsolutePath());
|
||||
Log.logException(e);
|
||||
} catch (final InterruptedException e) {
|
||||
System.err.println("Interrupted while parsing!");
|
||||
Log.logException(e);
|
||||
} catch (final NoClassDefFoundError e) {
|
||||
System.err.println("class not found: " + e.getMessage());
|
||||
} catch (final FileNotFoundException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
||||
// statistics
|
||||
System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");
|
||||
|
||||
// output
|
||||
if (document == null) {
|
||||
System.out.println("\t!!!Parsing without result!!!");
|
||||
} else {
|
||||
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
|
||||
try {
|
||||
// write file
|
||||
FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
|
||||
} catch (final IOException e) {
|
||||
System.err.println("error saving parsed document");
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
System.err.println("Cannot read file "+ dwgFile.getAbsolutePath());
|
||||
}
|
||||
} else {
|
||||
System.out.println("Please give a filename as first argument.");
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
Loading…
Reference in new issue