@ -19,24 +19,39 @@
* /
package net.yacy.cora.federate.opensearch ;
import java.io.BufferedInputStream ;
import java.io.ByteArrayInputStream ;
import java.io.FileInputStream ;
import java.io.IOException ;
import java.io.InputStream ;
import java.net.MalformedURLException ;
import java.nio.charset.StandardCharsets ;
import java.util.ArrayList ;
import java.util.Arrays ;
import java.util.List ;
import java.util.Map.Entry ;
import java.util.Properties ;
import org.jsoup.Jsoup ;
import org.jsoup.nodes.Document ;
import org.jsoup.nodes.Element ;
import org.jsoup.select.Elements ;
import net.yacy.cora.document.feed.RSSFeed ;
import net.yacy.cora.document.feed.RSSMessage ;
import net.yacy.cora.document.feed.RSSReader ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
import net.yacy.cora.federate.AbstractFederateSearchConnector ;
import net.yacy.cora.federate.FederateSearchConnector ;
import net.yacy.cora.federate.solr.SchemaDeclaration ;
import net.yacy.cora.federate.solr.SolrType ;
import net.yacy.cora.protocol.ClientIdentification ;
import net.yacy.cora.protocol.Domains ;
import net.yacy.cora.protocol.http.HTTPClient ;
import net.yacy.cora.util.ConcurrentLog ;
import net.yacy.document.TextParser ;
import net.yacy.kelondro.data.meta.URIMetadataNode ;
import net.yacy.search.Switchboard ;
import net.yacy.search.query.QueryParams ;
import net.yacy.search.schema.CollectionSchema ;
@ -45,15 +60,55 @@ import net.yacy.search.schema.CollectionSchema;
* configured systems until number of needed results are available .
* /
public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector {
/**
 * HTML mapping properties used to retrieve results from HTML when the results
 * are not provided as a standard RSS/Atom feed but as simple HTML.
 * <p>
 * Created empty by the constructor, then cleared and reloaded from the mapping
 * file each time {@code init} is called. The entries {@code _result} and
 * {@code _sku} hold the selectors locating each result node and its link;
 * every other entry whose key does not start with an underscore maps a YaCy
 * field name to the selector of the node providing its value.
 * </p>
 */
private Properties htmlMapping ;
/ * *
* @param instanceName open search instance name
* @return the html mapping configuration file name derived from the instance name
* /
public static String htmlMappingFileName ( final String instanceName ) {
return instanceName + ".html.map.properties" ;
}
/**
 * Creates a connector targeting the given OpenSearch system, with an initially
 * empty HTML mapping.
 *
 * @param urlTemplate OpenSearch URL template, stored as the base URL of this connector
 */
public OpenSearchConnector(final String urlTemplate) {
    super();
    this.htmlMapping = new Properties();
    this.baseurl = urlTemplate;
}
@Override
public boolean init ( final String name , final String urltemplate ) {
this . baseurl = urltemplate ;
public boolean init ( final String name , final String cfgFileName ) {
this . instancename = name ;
this . localcfg = null ; // no field mapping needed
this . localcfg = null ;
this . htmlMapping . clear ( ) ;
if ( cfgFileName ! = null & & ! cfgFileName . isEmpty ( ) ) {
BufferedInputStream cfgFileStream = null ;
try {
cfgFileStream = new BufferedInputStream ( new FileInputStream ( cfgFileName ) ) ;
this . htmlMapping . load ( cfgFileStream ) ;
} catch ( IOException e ) {
ConcurrentLog . config ( "OpenSearchConnector." + this . instancename , "Error reading html mapping file : " + cfgFileName , e ) ;
} finally {
if ( cfgFileStream ! = null ) {
try {
cfgFileStream . close ( ) ;
} catch ( IOException e ) {
ConcurrentLog . config ( "OpenSearchConnector." + this . instancename , "Error closing html mapping file : " + cfgFileName , e ) ;
}
}
}
}
return true ;
}
/ * *
* replace Opensearchdescription search template parameter with actual values
* /
@ -68,77 +123,311 @@ public class OpenSearchConnector extends AbstractFederateSearchConnector impleme
return tmps . replace ( "{searchTerms}" , query ) ;
}
/ * *
* @param linkElement html link result node . Must not be null .
* @return and { @link URIMetadataNode } instance from the html link element or null when minimum required information is missing or malformed
* /
protected URIMetadataNode htmlLinkToMetadataNode ( Element linkElement ) {
URIMetadataNode doc = null ;
String absoluteURL = linkElement . absUrl ( "href" ) ;
try {
if ( ! absoluteURL . isEmpty ( ) ) {
DigestURL uri = new DigestURL ( absoluteURL ) ;
doc = new URIMetadataNode ( uri ) ;
if ( linkElement . hasText ( ) & & ! this . htmlMapping . containsKey ( "title" ) ) {
/* Let's use the link text as default title when no mapping is defined.*/
doc . setField ( CollectionSchema . title . getSolrFieldName ( ) , linkElement . text ( ) ) ;
}
String targetLang = linkElement . attr ( "hreflang" ) ;
if ( targetLang ! = null & & ! targetLang . isEmpty ( ) ) {
doc . setField ( CollectionSchema . language_s . getSolrFieldName ( ) , targetLang ) ;
}
final String mime = TextParser . mimeOf ( uri ) ;
if ( mime ! = null ) {
doc . setField ( CollectionSchema . content_type . getSolrFieldName ( ) , mime ) ;
}
/ *
* add collection "dht" which is used to differentiate metadata
* from full crawl data in the index
* /
doc . setField ( CollectionSchema . collection_sxt . getSolrFieldName ( ) , "dht" ) ;
}
} catch ( MalformedURLException e ) {
ConcurrentLog . fine ( "OpenSearchConnector." + this . instancename , "Malformed url : " + absoluteURL ) ;
}
return doc ;
}
/ * *
* Extract results from the HTML result stream , using the html mapping properties .
* Important : it is the responsibility of the caller to close the stream .
* @param resultStream HTML stream containing OpenSearch results . Must not be null .
* @param charsetName characters set name . May be null : in that case the eventual { @code http - equiv } meta tag will be used .
* @return a list of URI nodes , eventually empty .
* @throws IOException when a read / write exception occurred
* /
protected List < URIMetadataNode > parseHTMLResult ( InputStream resultStream , String charsetName ) throws IOException {
List < URIMetadataNode > docs = new ArrayList < > ( ) ;
String resultSelector = this . htmlMapping . getProperty ( "_result" ) ;
String skuSelector = this . htmlMapping . getProperty ( "_sku" ) ;
if ( resultSelector = = null | | skuSelector = = null ) {
ConcurrentLog . warn ( "OpenSearchConnector." + this . instancename , "HTML mapping is incomplete!" ) ;
return docs ;
}
Document jsoupDoc = Jsoup . parse ( resultStream , charsetName , this . baseurl ) ;
Elements results = jsoupDoc . select ( resultSelector ) ;
for ( Element result : results ) {
Elements skuNodes = result . select ( skuSelector ) ;
if ( ! skuNodes . isEmpty ( ) ) {
Element skuNode = skuNodes . first ( ) ;
if ( ! "a" . equals ( skuNode . tagName ( ) ) ) {
/ *
* The selector may refer to a node with link ( s ) inside
* /
Elements links = skuNode . select ( "a[href]" ) ;
if ( ! links . isEmpty ( ) ) {
skuNode = links . first ( ) ;
}
}
if ( skuNode . hasAttr ( "href" ) ) {
URIMetadataNode newDoc = htmlLinkToMetadataNode ( skuNode ) ;
if ( newDoc ! = null ) {
/* Let's handle other field mappings */
htmlResultToFields ( result , newDoc ) ;
docs . add ( newDoc ) ;
}
}
}
}
return docs ;
}
/ * *
* Perform mapping from an HTML result node to YaCy fields using the htmlMapping configuration .
* @param resultNode html single result node
* @param newdoc result document to fill
* /
private void htmlResultToFields ( Element resultNode , URIMetadataNode newdoc ) {
for ( Entry < Object , Object > entry : this . htmlMapping . entrySet ( ) ) {
if ( entry . getKey ( ) instanceof String & & entry . getValue ( ) instanceof String ) {
String yacyFieldName = ( String ) entry . getKey ( ) ;
String selector = ( String ) entry . getValue ( ) ;
if ( ! yacyFieldName . startsWith ( "_" ) ) {
/* If Switchboard environment is set, check the index configuration has this field enabled */
if ( Switchboard . getSwitchboard ( ) = = null | | Switchboard . getSwitchboard ( ) . index = = null
| | Switchboard . getSwitchboard ( ) . index . fulltext ( ) . getDefaultConfiguration ( )
. contains ( yacyFieldName ) ) {
Elements nodes = resultNode . select ( selector ) ;
SchemaDeclaration est ;
try {
est = CollectionSchema . valueOf ( yacyFieldName ) ;
} catch ( IllegalArgumentException e ) {
ConcurrentLog . config ( "OpenSearchConnector." + this . instancename ,
"Ignored " + yacyFieldName + " field mapping : not a field of this schema." ) ;
continue ;
}
if ( est . isMultiValued ( ) ) {
if ( ! nodes . isEmpty ( ) ) {
for ( Element node : nodes ) {
String value = node . text ( ) ;
if ( ! value . isEmpty ( ) ) {
newdoc . addField ( yacyFieldName , value ) ;
}
}
}
} else {
if ( ! nodes . isEmpty ( ) ) {
Element node = nodes . first ( ) ;
String value = node . text ( ) ;
if ( ! value . isEmpty ( ) ) {
/* Perform eventual type conversion */
try {
if ( est . getType ( ) = = SolrType . num_integer ) {
newdoc . setField ( yacyFieldName , Integer . parseInt ( value ) ) ;
} else {
newdoc . setField ( yacyFieldName , value ) ;
}
} catch ( NumberFormatException ex ) {
continue ;
}
}
}
}
}
}
}
}
}
/ * *
* queries remote system and returns the resultlist ( waits until results
* transmitted or timeout ) This is the main access routine used for the
* serach and query operation For internal access delay time , also the
* se ar ch and query operation For internal access delay time , also the
* this . lastaccessed time needs to be set here .
*
* @return query results ( metadata ) with fields according to YaCy schema
* /
@Override
public List < URIMetadataNode > query ( QueryParams query ) {
List < URIMetadataNode > docs = new ArrayList < URIMetadataNode > ( ) ;
return query ( query . getQueryGoal ( ) . getQueryString ( false ) , 0 , query . itemsPerPage ) ;
}
/ * *
* Query the remote system at baseurl with the specified search terms
* @param searchTerms search terms
* @param startIndex index offset
* @param count maximum results number
* @return a result list eventually empty when no results where found or when an error occured
* /
public List < URIMetadataNode > query ( final String searchTerms , final int startIndex , final int count ) {
List < URIMetadataNode > docs = new ArrayList < URIMetadataNode > ( ) ;
// see http://www.loc.gov/standards/sru/
String searchurl = this . parseSearchTemplate ( baseurl , query . getQueryGoal ( ) . getQueryString ( false ) , 0 , query . itemsPerPage ) ;
String searchurl = this . parseSearchTemplate ( baseurl , searchTerms, startIndex , count ) ;
try {
MultiProtocolURL aurl = new MultiProtocolURL ( searchurl ) ;
DigestURL aurl = new Digest URL( searchurl ) ;
try {
this . lastaccesstime = System . currentTimeMillis ( ) ;
final HTTPClient httpClient = new HTTPClient ( ClientIdentification . yacyInternetCrawlerAgent ) ;
byte [ ] result = httpClient . GETbytes ( aurl , null , null , false ) ;
RSSReader rssReader = RSSReader . parse ( RSSFeed . DEFAULT_MAXSIZE , result ) ;
if ( rssReader ! = null ) {
final RSSFeed feed = rssReader . getFeed ( ) ;
if ( feed ! = null ) {
for ( final RSSMessage item : feed ) {
try {
DigestURL uri = new DigestURL ( item . getLink ( ) ) ;
URIMetadataNode doc = new URIMetadataNode ( uri ) ;
doc . setField ( CollectionSchema . charset_s . getSolrFieldName ( ) , StandardCharsets . UTF_8 . name ( ) ) ;
doc . setField ( CollectionSchema . author . getSolrFieldName ( ) , item . getAuthor ( ) ) ;
doc . setField ( CollectionSchema . title . getSolrFieldName ( ) , item . getTitle ( ) ) ;
doc . setField ( CollectionSchema . language_s . getSolrFieldName ( ) , item . getLanguage ( ) ) ;
doc . setField ( CollectionSchema . last_modified . getSolrFieldName ( ) , item . getPubDate ( ) ) ;
final String mime = TextParser . mimeOf ( uri ) ;
if ( mime ! = null ) {
doc . setField ( CollectionSchema . content_type . getSolrFieldName ( ) , mime ) ;
}
if ( item . getCategory ( ) . isEmpty ( ) ) {
doc . setField ( CollectionSchema . keywords . getSolrFieldName ( ) , Arrays . toString ( item . getSubject ( ) ) ) ;
} else {
doc . setField ( CollectionSchema . keywords . getSolrFieldName ( ) , Arrays . toString ( item . getSubject ( ) ) + " " + item . getCategory ( ) ) ;
}
doc . setField ( CollectionSchema . publisher_t . getSolrFieldName ( ) , item . getCopyright ( ) ) ;
if ( result = = null ) {
String details ;
if ( httpClient . getHttpResponse ( ) ! = null & & httpClient . getHttpResponse ( ) . getStatusLine ( ) ! = null ) {
details = " HTTP status code : " + httpClient . getStatusCode ( ) ;
} else {
details = "" ;
}
throw new IOException ( "Could not get a response." + details ) ;
}
doc . setField ( CollectionSchema . text_t . getSolrFieldName ( ) , item . getDescriptions ( ) ) ;
// we likely got only a search related snippet (take is as text content)
// add collection "dht" which is used to differentiate metadata from full crawl data in the index
doc . setField ( CollectionSchema . collection_sxt . getSolrFieldName ( ) , "dht" ) ;
if ( "text/html" . equals ( httpClient . getMimeType ( ) ) ) {
if ( this . htmlMapping . isEmpty ( ) ) {
ConcurrentLog . warn ( "OpenSearchConnector." + this . instancename , "Received HTML result but mapping is not configured!" ) ;
} else {
/ *
* Result was received as html : let ' s try to use the
* provided mapping to retrieve results from HTML
* /
docs = parseHTMLResult ( new ByteArrayInputStream ( result ) , httpClient . getCharacterEncoding ( ) ) ;
}
} else {
/* Other mime types or unknown : let's try to parse the result as RSS or Atom Feed */
RSSReader rssReader = RSSReader . parse ( RSSFeed . DEFAULT_MAXSIZE , result ) ;
if ( rssReader ! = null ) {
final RSSFeed feed = rssReader . getFeed ( ) ;
if ( feed ! = null ) {
for ( final RSSMessage item : feed ) {
try {
DigestURL uri = new DigestURL ( item . getLink ( ) ) ;
if ( item . getLat ( ) ! = 0.0 & & item . getLon ( ) ! = 0.0 ) {
doc . setField ( CollectionSchema . coordinate_p . getSolrFieldName ( ) , item . getLat ( ) + "," + item . getLon ( ) ) ;
}
if ( item . getSize ( ) > 0 ) {
doc . setField ( CollectionSchema . size_i . getSolrFieldName ( ) , item . getSize ( ) ) ;
}
URIMetadataNode doc = new URIMetadataNode ( uri ) ;
doc . setField ( CollectionSchema . charset_s . getSolrFieldName ( ) , StandardCharsets . UTF_8 . name ( ) ) ;
doc . setField ( CollectionSchema . author . getSolrFieldName ( ) , item . getAuthor ( ) ) ;
doc . setField ( CollectionSchema . title . getSolrFieldName ( ) , item . getTitle ( ) ) ;
doc . setField ( CollectionSchema . language_s . getSolrFieldName ( ) , item . getLanguage ( ) ) ;
doc . setField ( CollectionSchema . last_modified . getSolrFieldName ( ) , item . getPubDate ( ) ) ;
final String mime = TextParser . mimeOf ( uri ) ;
if ( mime ! = null ) {
doc . setField ( CollectionSchema . content_type . getSolrFieldName ( ) , mime ) ;
}
if ( item . getCategory ( ) . isEmpty ( ) ) {
doc . setField ( CollectionSchema . keywords . getSolrFieldName ( ) , Arrays . toString ( item . getSubject ( ) ) ) ;
} else {
doc . setField ( CollectionSchema . keywords . getSolrFieldName ( ) , Arrays . toString ( item . getSubject ( ) ) + " " + item . getCategory ( ) ) ;
}
doc . setField ( CollectionSchema . publisher_t . getSolrFieldName ( ) , item . getCopyright ( ) ) ;
docs . add ( doc ) ;
} catch ( final MalformedURLException e ) {
doc . setField ( CollectionSchema . text_t . getSolrFieldName ( ) , item . getDescriptions ( ) ) ;
// we likely got only a search related snippet (take is as text content)
// add collection "dht" which is used to differentiate metadata from full crawl data in the index
doc . setField ( CollectionSchema . collection_sxt . getSolrFieldName ( ) , "dht" ) ;
if ( item . getLat ( ) ! = 0.0 & & item . getLon ( ) ! = 0.0 ) {
doc . setField ( CollectionSchema . coordinate_p . getSolrFieldName ( ) , item . getLat ( ) + "," + item . getLon ( ) ) ;
}
if ( item . getSize ( ) > 0 ) {
doc . setField ( CollectionSchema . size_i . getSolrFieldName ( ) , item . getSize ( ) ) ;
}
docs . add ( doc ) ;
} catch ( final MalformedURLException e ) {
}
}
}
ConcurrentLog . info ( "OpenSerachConnector" , "received " + docs . size ( ) + " results from " + this . instancename ) ;
}
ConcurrentLog . info ( "OpenSearchConnector." + this . instancename , "received " + docs . size ( ) + " results from " + this . instancename ) ;
}
}
}
} catch ( IOException ex ) {
ConcurrentLog . logException ( ex ) ;
ConcurrentLog . info ( "OpenSearchConnector" , "no connection to " + searchurl ) ;
ConcurrentLog . info ( "OpenSearchConnector . " + this . instancename , "no connection to " + searchurl ) ;
}
} catch ( MalformedURLException ee ) {
ConcurrentLog . warn ( "OpenSearchConnector" , "malformed url " + searchurl ) ;
ConcurrentLog . warn ( "OpenSearchConnector . " + this . instancename , "malformed url " + searchurl ) ;
}
return docs ;
}
/ * *
* Main procedure : can be used to test results retrieval from an open search system
* @param args main arguments list :
* < ol >
* < li > OpenSearch URL template ( required ) < / li >
* < li > Search term ( required ) < / li >
* < li > Html mapping file path ( optional ) < / li >
* < / ol >
* /
public static void main ( String args [ ] ) {
try {
if ( args . length < 2 ) {
System . out . println ( "Usage : java " + OpenSearchConnector . class . getCanonicalName ( )
+ " <templateURL> <\"searchTerms\"> [htmlMappingFile]" ) ;
return ;
}
OpenSearchConnector connector = new OpenSearchConnector ( args [ 0 ] ) ;
String htmlMappingFile ;
if ( args . length > 2 ) {
htmlMappingFile = args [ 2 ] ;
} else {
htmlMappingFile = null ;
}
connector . init ( "testConnector" , htmlMappingFile ) ;
String searchTerms = args [ 1 ] ;
if ( searchTerms . length ( ) > 2 & & searchTerms . startsWith ( "\"" ) & & searchTerms . endsWith ( "\"" ) ) {
searchTerms = searchTerms . substring ( 1 , searchTerms . length ( ) - 1 ) ;
}
List < URIMetadataNode > docs = connector . query ( searchTerms , 0 , 20 ) ;
if ( docs . isEmpty ( ) ) {
System . out . println ( "No results" ) ;
} else {
for ( URIMetadataNode doc : docs ) {
System . out . println ( "title : " + doc . getFieldValue ( CollectionSchema . title . getSolrFieldName ( ) ) ) ;
System . out . println ( "sku : " + doc . getFieldValue ( CollectionSchema . sku . getSolrFieldName ( ) ) ) ;
System . out . println (
"Description : " + doc . getFieldValue ( CollectionSchema . description_txt . getSolrFieldName ( ) ) + "\n" ) ;
}
}
} finally {
/* Shutdown running threads */
Domains . close ( ) ;
try {
HTTPClient . closeConnectionManager ( ) ;
} catch ( final InterruptedException e ) {
}
ConcurrentLog . shutdown ( ) ;
}
}
}