Added support for HTML OpenSearch results.

Many OpenSearch systems do not provide results as standard RSS/Atom feeds but only as HTML. This modification adds support for custom OpenSearch HTML results through the use of mapping files (as already done for federated Solr search), relying on CSS-like selectors to retrieve information from HTML content. An example mapping file is provided to map results from the www.npmjs.com OpenSearch URL.
8 years ago · bf16de29c1
parent a79194a102
commit bf16de29c1
7 changed files with 475 additions and 74 deletions
--- a/defaults/federatecfg/npmjs.html.map.properties
+++ b/defaults/federatecfg/npmjs.html.map.properties
@ -0,0 +1,24 @@
+# www.npmjs.com HTML search results mapping
+# OpenSearch description : https://www.npmjs.com/opensearch.xml
+# OpenSearch template URL : https://www.npmjs.com/search?q={searchTerms}
+
+# This is an example mapping file for OpenSearch systems or search APIs providing results only as HTML
+# When possible, it is preferable to use an OpenSearch URL providing results as a standard RSS or Atom feed, since the mapping is then generic
+# Selectors are using CSS or JQuery-like syntax, as described at https://jsoup.org/apidocs/org/jsoup/select/Selector.html
+# Standard Java properties file syntax is used here instead of usual YaCy Configuration syntax to easily allow '#' characters in values (example : _result=div#result li)
+# Character encoding is assumed to be ISO-8859-1 
+
+# Result node selector (required)
+# In this example, a list item such as : <li class="package-details css-ywvx7i" data-reactid="n">
+_result=.package-details
+
+# Result link selector relative to the selected result block (required)
+# In this example, a link such as <a href="https://www.npmjs.com/package/packageName" class="name css-1nx9rl1">packageName</a>
+_sku=.name
+
+# field mappings
+# YaCyFieldname = HTML text node selector, relative to the result block
+# In this example title is the text of the link so it has the same selector
+title=.name
+# In this example the description is in a paragraph tag such as <p class="description css-zqstoe">Package description</p>
+description_txt=.description
--- a/defaults/heuristicopensearch.conf
+++ b/defaults/heuristicopensearch.conf
@ -12,11 +12,14 @@
 ## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
 ##

+## Additional mapping files for OpenSearch HTML results can be set in DATA/SETTINGS/federatecfg/[name].html.map.properties 
+
 #Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss  # get results from Faroo news-search
 #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?}  #Search WordPress.com Blogs
 #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv
 #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2
-#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web 
+#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web
+#npmjs = https://www.npmjs.com/search?q={searchTerms} # Search JavaScript packages from the npm repository 

 ## In addition to OpenSearch systems other connectors are available to query foreign systems
 ## the syntax is
--- a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java
+++ b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java
@ -109,9 +109,11 @@ abstract public class AbstractFederateSearchConnector implements FederateSearchC
            @Override
            public void run() {
                Thread.currentThread().setName("heuristic:" + instancename);
+                ConcurrentLog.info("YACY SEARCH (federated)", "Send search query to " +  instancename);
                theSearch.oneFeederStarted();
                List<URIMetadataNode> doclist = query(theSearch.getQuery());
                if (doclist != null) {
+                    ConcurrentLog.info("YACY SEARCH (federated)", "Got " + doclist.size() + " documents from " +  instancename);
                    Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null
                    Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null
                    theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size());
@ -119,6 +121,8 @@ abstract public class AbstractFederateSearchConnector implements FederateSearchC
                    for (URIMetadataNode doc : doclist) {
                        theSearch.addHeuristic(doc.hash(), instancename, false);
                    }
+                } else {
+                	ConcurrentLog.info("YACY SEARCH (federated)", "Got no results from " +  instancename);
                }
                // that's all we need to display serach result
                theSearch.oneFeederTerminated();
--- a/source/net/yacy/cora/federate/FederateSearchManager.java
+++ b/source/net/yacy/cora/federate/FederateSearchManager.java
@ -19,19 +19,22 @@
 */
 package net.yacy.cora.federate;

-import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
-
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.storage.Configuration;
@ -49,8 +52,6 @@ import net.yacy.search.query.QueryModifier;
 import net.yacy.search.query.QueryParams;
 import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.WebgraphSchema;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;

 /**
 * Handling of queries to configured remote OpenSearch systems.
@ -107,8 +108,8 @@ public class FederateSearchManager {
                                ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url);
                            }
                        } else { // handle opensearch url template
-                            OpenSearchConnector osc = new OpenSearchConnector();
-                            if (osc.init(name, url)) {
+                            OpenSearchConnector osc = new OpenSearchConnector(url);
+                            if (osc.init(name, sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name))) {
                                conlist.add(osc);
                            }
                        }
@ -234,8 +235,13 @@ public class FederateSearchManager {
                try {
                    conf.commit();
                    if (active) {
-                        OpenSearchConnector osd = new OpenSearchConnector();
-                        if (osd.init(name, urlTemplate)) {
+                        OpenSearchConnector osd = new OpenSearchConnector(urlTemplate);
+                        String htmlMappingFile = null;
+                        Switchboard sb = Switchboard.getSwitchboard();
+                        if(sb != null) {
+                        	htmlMappingFile = sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
+                        }
+                        if (osd.init(name, htmlMappingFile)) {
                            conlist.add(osd);
                        }
                    }
@ -407,9 +413,8 @@ public class FederateSearchManager {
                                    ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url);
                                }
                            } else { // handle opensearch url template
-                                OpenSearchConnector osd;
-                                osd = new OpenSearchConnector();
-                                if (osd.init(name, url)) {
+                                OpenSearchConnector osd = new OpenSearchConnector(url);
+                                if (osd.init(name, confFile.getParent()+"/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name))) {
                                    conlist.add(osd);
                                }
                            }
--- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java
+++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java
@ -19,24 +19,39 @@
 */
 package net.yacy.cora.federate.opensearch;

+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Map.Entry;
+import java.util.Properties;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
 import net.yacy.cora.document.feed.RSSFeed;
 import net.yacy.cora.document.feed.RSSMessage;
 import net.yacy.cora.document.feed.RSSReader;
 import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.AbstractFederateSearchConnector;
 import net.yacy.cora.federate.FederateSearchConnector;
+import net.yacy.cora.federate.solr.SchemaDeclaration;
+import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.Switchboard;
 import net.yacy.search.query.QueryParams;
 import net.yacy.search.schema.CollectionSchema;

@ -45,15 +60,55 @@ import net.yacy.search.schema.CollectionSchema;
 * configured systems until number of needed results are available.
 */
 public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector {
-
+	
+	/** 
+	 * HTML mapping properties used to retrieve result from HTML when the results
+	 * are not provided as a standard RSS/Atom feed but as simple HTML.
+	 */
+	private Properties htmlMapping;
+	
+	/**
+	 * Builds the name of the HTML mapping configuration file associated with an
+	 * OpenSearch instance.
+	 * 
+	 * @param instanceName open search instance name
+	 * @return the instance name followed by the ".html.map.properties" suffix
+	 */
+	public static String htmlMappingFileName(final String instanceName) {
+		final String mappingSuffix = ".html.map.properties";
+		return instanceName + mappingSuffix;
+	}
+	
+	/**
+	 * Creates a connector for the given URL template, starting with an empty
+	 * HTML mapping.
+	 * 
+	 * @param urlTemplate OpenSearch URL template, stored as the base URL of this connector
+	 */
+	public OpenSearchConnector(final String urlTemplate) {
+		super();
+		this.htmlMapping = new Properties();
+		this.baseurl = urlTemplate;
+	}
+	
    @Override
-    public boolean init(final String name, final String urltemplate) {
-        this.baseurl = urltemplate;
+    public boolean init(final String name, final String cfgFileName) {
        this.instancename = name;
-        this.localcfg = null; // no field mapping needed
+        this.localcfg = null;
+        this.htmlMapping.clear();
+		if (cfgFileName != null && !cfgFileName.isEmpty()) {
+			BufferedInputStream cfgFileStream = null;
+			try {
+				cfgFileStream = new BufferedInputStream(new FileInputStream(cfgFileName));
+				this.htmlMapping.load(cfgFileStream);
+			} catch (IOException e) {
+				ConcurrentLog.config("OpenSearchConnector." + this.instancename, "Error reading html mapping file : " + cfgFileName, e);
+			} finally {
+				if (cfgFileStream != null) {
+					try {
+						cfgFileStream.close();
+					} catch (IOException e) {
+						ConcurrentLog.config("OpenSearchConnector." + this.instancename, "Error closing html mapping file : " + cfgFileName, e);
+					}
+				}
+			}
+		}
        return true;
    }
-
+    
    /**
     * replace Opensearchdescription search template parameter with actual values
     */
@ -68,77 +123,311 @@ public class OpenSearchConnector extends AbstractFederateSearchConnector impleme
        return tmps.replace("{searchTerms}", query);
    }

+    /**
+     * Builds a result document from an HTML link element, using the absolute
+     * form of its {@code href} attribute as the document URL.
+     *
+     * @param linkElement html link result node. Must not be null.
+     * @return a {@link URIMetadataNode} instance built from the html link element, or null when the
+     *         link has no absolute URL or that URL is malformed
+     */
+	protected URIMetadataNode htmlLinkToMetadataNode(Element linkElement) {
+		final String href = linkElement.absUrl("href");
+		if (href.isEmpty()) {
+			/* No usable link target : nothing to build */
+			return null;
+		}
+
+		URIMetadataNode node = null;
+		try {
+			final DigestURL targetURL = new DigestURL(href);
+			node = new URIMetadataNode(targetURL);
+
+			/* The link text is only the default title : an explicit "title" mapping takes precedence */
+			if (linkElement.hasText() && !this.htmlMapping.containsKey("title")) {
+				node.setField(CollectionSchema.title.getSolrFieldName(), linkElement.text());
+			}
+
+			final String hrefLang = linkElement.attr("hreflang");
+			if (hrefLang != null && !hrefLang.isEmpty()) {
+				node.setField(CollectionSchema.language_s.getSolrFieldName(), hrefLang);
+			}
+
+			final String mimeType = TextParser.mimeOf(targetURL);
+			if (mimeType != null) {
+				node.setField(CollectionSchema.content_type.getSolrFieldName(), mimeType);
+			}
+
+			/*
+			 * The "dht" collection differentiates metadata from full crawl
+			 * data in the index
+			 */
+			node.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht");
+		} catch (MalformedURLException e) {
+			ConcurrentLog.fine("OpenSearchConnector." + this.instancename, "Malformed url : " + href);
+		}
+		return node;
+	}
+    
+	/**
+	 * Extract results from the HTML result stream, using the html mapping properties.
+	 * Important : it is the responsibility of the caller to close the stream.
+	 * @param resultStream HTML stream containing OpenSearch results. Must not be null.
+	 * @param charsetName characters set name. May be null : in that case the eventual {@code http-equiv} meta tag will be used.
+	 * @return a list of URI nodes, eventually empty.
+	 * @throws IOException when a read/write exception occurred
+	 */
+	protected List<URIMetadataNode> parseHTMLResult(InputStream resultStream, String charsetName) throws IOException {
+		List<URIMetadataNode> docs = new ArrayList<>();
+		String resultSelector = this.htmlMapping.getProperty("_result");
+		String skuSelector = this.htmlMapping.getProperty("_sku");
+		if (resultSelector == null || skuSelector == null) {
+			ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "HTML mapping is incomplete!");
+			return docs;
+		}
+
+		Document jsoupDoc = Jsoup.parse(resultStream, charsetName, this.baseurl);
+		Elements results = jsoupDoc.select(resultSelector);
+
+		for (Element result : results) {
+			Elements skuNodes = result.select(skuSelector);
+			if (!skuNodes.isEmpty()) {
+				Element skuNode = skuNodes.first();
+				if (!"a".equals(skuNode.tagName())) {
+					/*
+					 * The selector may refer to a node with link(s) inside
+					 */
+					Elements links = skuNode.select("a[href]");
+					if (!links.isEmpty()) {
+						skuNode = links.first();
+					}
+				}
+				if (skuNode.hasAttr("href")) {
+					URIMetadataNode newDoc = htmlLinkToMetadataNode(skuNode);
+					if (newDoc != null) {
+						/* Let's handle other field mappings */
+						htmlResultToFields(result, newDoc);
+						docs.add(newDoc);
+					}
+				}
+			}
+		}
+		return docs;
+    }
+
+	/**
+	 * Perform mapping from an HTML result node to YaCy fields using the htmlMapping configuration.
+	 * @param resultNode html single result node
+	 * @param newdoc result document to fill
+	 */
+	private void htmlResultToFields(Element resultNode, URIMetadataNode newdoc) {
+		for (Entry<Object, Object> entry : this.htmlMapping.entrySet()) {
+			if (entry.getKey() instanceof String && entry.getValue() instanceof String) {
+				String yacyFieldName = (String) entry.getKey();
+				String selector = (String) entry.getValue();
+				
+				if (!yacyFieldName.startsWith("_")) {
+					/* If Switchboard environment is set, check the index configuration has this field enabled */
+					if (Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().index == null
+							|| Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration()
+									.contains(yacyFieldName)) {
+
+						Elements nodes = resultNode.select(selector);
+
+						SchemaDeclaration est;
+						try {
+							est = CollectionSchema.valueOf(yacyFieldName);
+						} catch(IllegalArgumentException e) {
+							ConcurrentLog.config("OpenSearchConnector." + this.instancename,
+									"Ignored " + yacyFieldName + " field mapping : not a field of this schema.");
+							continue;
+						}
+						if (est.isMultiValued()) {
+							if (!nodes.isEmpty()) {
+								for (Element node : nodes) {
+									String value = node.text();
+									if (!value.isEmpty()) {
+										newdoc.addField(yacyFieldName, value);
+									}
+								}
+							}
+						} else {
+							if (!nodes.isEmpty()) {
+								Element node = nodes.first();
+								String value = node.text();
+								if (!value.isEmpty()) {
+									/* Perform eventual type conversion */
+									try {
+										if (est.getType() == SolrType.num_integer) {
+											newdoc.setField(yacyFieldName, Integer.parseInt(value));
+										} else {
+											newdoc.setField(yacyFieldName, value);
+										}
+									} catch (NumberFormatException ex) {
+										continue;
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
    /**
     * queries remote system and returns the resultlist (waits until results
     * transmitted or timeout) This is the main access routine used for the
-     * serach and query operation For internal access delay time, also the
+     * search and query operation For internal access delay time, also the
     * this.lastaccessed time needs to be set here.
     *
     * @return query results (metadata) with fields according to YaCy schema
     */
    @Override
    public List<URIMetadataNode> query(QueryParams query) {
-        List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>();

+        return query(query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage);
+    }
+    
+    /**
+     * Query the remote system at baseurl with the specified search terms
+     * @param searchTerms search terms
+     * @param startIndex index offset
+     * @param count maximum results number
+     * @return a result list, possibly empty when no results were found or when an error occurred
+     */
+    public List<URIMetadataNode> query(final String searchTerms, final int startIndex, final int count) {
+    	List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>();
+    	
        // see http://www.loc.gov/standards/sru/
-        String searchurl = this.parseSearchTemplate(baseurl, query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage);
+        String searchurl = this.parseSearchTemplate(baseurl, searchTerms, startIndex, count);
        try {
-            MultiProtocolURL aurl = new MultiProtocolURL(searchurl);
+        	DigestURL aurl = new DigestURL(searchurl);
            try {
                this.lastaccesstime = System.currentTimeMillis();
+                
                final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
                byte[] result = httpClient.GETbytes(aurl, null, null, false);
-                RSSReader rssReader =  RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
-                if (rssReader != null) {
-                    final RSSFeed feed = rssReader.getFeed();
-                    if (feed != null) {
-                        for (final RSSMessage item : feed) {
-                            try {
-                                DigestURL uri = new DigestURL(item.getLink());
-
-                                URIMetadataNode doc = new URIMetadataNode(uri);
-                                doc.setField(CollectionSchema.charset_s.getSolrFieldName(), StandardCharsets.UTF_8.name());
-                                doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor());
-                                doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle());
-                                doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage());
-                                doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate());
-                                final String mime = TextParser.mimeOf(uri);
-                                if (mime != null) {
-                                    doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime);
-                                }
-                                if (item.getCategory().isEmpty()) {
-                                    doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()));
-                                } else {
-                                    doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory());
-                                }
-                                doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright());
+                
+    			if(result == null) {
+    				String details;
+    				if(httpClient.getHttpResponse() != null && httpClient.getHttpResponse().getStatusLine() != null) {
+    					details = " HTTP status code : " + httpClient.getStatusCode();
+    				} else {
+    					details = "";
+    				}
+                	throw new IOException("Could not get a response." + details);
+    			}

-                                doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions());
-                                // we likely got only a search related snippet (take is as text content)
-                                // add collection "dht" which is used to differentiate metadata from full crawl data in the index
-                                doc.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht");
+                if("text/html".equals(httpClient.getMimeType())) {
+					if (this.htmlMapping.isEmpty()) {
+						ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "Received HTML result but mapping is not configured!");
+					} else {
+						/*
+						 * Result was received as html : let's try to use the
+						 * provided mapping to retrieve results from HTML
+						 */
+						docs = parseHTMLResult(new ByteArrayInputStream(result), httpClient.getCharacterEncoding());
+					}
+                } else {
+                	/* Other mime types or unknown : let's try to parse the result as RSS or Atom Feed */
+                    RSSReader rssReader =  RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
+                    if (rssReader != null) {
+                        final RSSFeed feed = rssReader.getFeed();
+                        if (feed != null) {
+                            for (final RSSMessage item : feed) {
+                                try {
+                                    DigestURL uri = new DigestURL(item.getLink());

-                                if (item.getLat() != 0.0 && item.getLon() != 0.0) {
-                                    doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon());
-                                }
-                                if (item.getSize() > 0) {
-                                    doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize());
-                                }
+                                    URIMetadataNode doc = new URIMetadataNode(uri);
+                                    doc.setField(CollectionSchema.charset_s.getSolrFieldName(), StandardCharsets.UTF_8.name());
+                                    doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor());
+                                    doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle());
+                                    doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage());
+                                    doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate());
+                                    final String mime = TextParser.mimeOf(uri);
+                                    if (mime != null) {
+                                        doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime);
+                                    }
+                                    if (item.getCategory().isEmpty()) {
+                                        doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()));
+                                    } else {
+                                        doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory());
+                                    }
+                                    doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright());

-                                docs.add(doc);
-                            } catch (final MalformedURLException e) {
+                                    doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions());
+                                    // we likely got only a search related snippet (take is as text content)
+                                    // add collection "dht" which is used to differentiate metadata from full crawl data in the index
+                                    doc.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht");
+
+                                    if (item.getLat() != 0.0 && item.getLon() != 0.0) {
+                                        doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon());
+                                    }
+                                    if (item.getSize() > 0) {
+                                        doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize());
+                                    }
+
+                                    docs.add(doc);
+                                } catch (final MalformedURLException e) {
+                                }
                            }
-                        }
-                        ConcurrentLog.info("OpenSerachConnector", "received " + docs.size() + " results from " + this.instancename);
-                    }
+                			ConcurrentLog.info("OpenSearchConnector." + this.instancename, "received " + docs.size() + " results from " + this.instancename);
+                		}
+                	}
                }
            } catch (IOException ex) {
                ConcurrentLog.logException(ex);
-                ConcurrentLog.info("OpenSearchConnector", "no connection to " + searchurl);
+                ConcurrentLog.info("OpenSearchConnector." + this.instancename, "no connection to " + searchurl);
            }
        } catch (MalformedURLException ee) {
-            ConcurrentLog.warn("OpenSearchConnector", "malformed url " + searchurl);
+            ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "malformed url " + searchurl);
        }
        return docs;
    }
+    
+    /**
+     * Main procedure : can be used to test results retrieval from an open search system.
+     * Queries the given template URL for at most 20 results starting at index 0 and prints the
+     * title, sku and description of each result to the standard output, then shuts down the
+     * helper services used during the query.
+     * @param args main arguments list:
+     * <ol>
+     * 	<li>OpenSearch URL template (required)</li>
+     * 	<li>Search term (required). Enclosing double quotes, when present, are removed.</li>
+     * 	<li>Html mapping file path (optional)</li>
+     * </ol>
+     */
+	public static void main(String args[]) {
+		try {
+			if (args.length < 2) {
+				System.out.println("Usage : java " + OpenSearchConnector.class.getCanonicalName()
+						+ " <templateURL> <\"searchTerms\"> [htmlMappingFile]");
+				return;
+			}
+			OpenSearchConnector connector = new OpenSearchConnector(args[0]);
+			String htmlMappingFile;
+			if (args.length > 2) {
+				htmlMappingFile = args[2];
+			} else {
+				htmlMappingFile = null;
+			}
+			connector.init("testConnector", htmlMappingFile);
+			String searchTerms = args[1];
+			/* Strip enclosing double quotes that the shell may have passed through literally */
+			if(searchTerms.length() > 2 && searchTerms.startsWith("\"") && searchTerms.endsWith("\"")) {
+				searchTerms = searchTerms.substring(1, searchTerms.length() - 1);
+			}
+			List<URIMetadataNode> docs = connector.query(searchTerms, 0, 20);
+			if (docs.isEmpty()) {
+				System.out.println("No results");
+			} else {
+
+				for (URIMetadataNode doc : docs) {
+					System.out.println("title : " + doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
+					System.out.println("sku : " + doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+					System.out.println(
+							"Description : " + doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()) + "\n");
+				}
+			}
+		} finally {
+			/* Shutdown running threads (also reached on the usage-message early return) */
+			Domains.close();
+			try {
+				HTTPClient.closeConnectionManager();
+			} catch (final InterruptedException e) {
+				/* Deliberately ignored : this is the last cleanup step before main returns */
+			}
+			ConcurrentLog.shutdown();
+		}
+	}
 }
--- a/source/net/yacy/cora/protocol/Domains.java
+++ b/source/net/yacy/cora/protocol/Domains.java
@ -48,6 +48,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.regex.Pattern;
@ -728,6 +729,9 @@ public class Domains {

    public static synchronized void close() {
        if (globalHosts != null) try {globalHosts.close();} catch (final IOException e) {log.warn(e);}
+        if(getByNameService != null) {
+        	getByNameService.shutdownNow();
+        }
    }

    /**
@ -795,9 +799,11 @@ public class Domains {
        NAME_CACHE_HIT.insertIfAbsent(host, i);
        cacheHit_Insert++;
    }
+    
+	final private static ExecutorService getByNameService = Executors
+			.newCachedThreadPool(new NamePrefixThreadFactory("InetAddress.getByName"));

-	final private static TimeLimiter timeLimiter = new SimpleTimeLimiter(
-			Executors.newCachedThreadPool(new NamePrefixThreadFactory("InetAddress.getByName")));
+	final private static TimeLimiter timeLimiter = new SimpleTimeLimiter(getByNameService);

    /**
     * strip off any parts of an url, address string (containing host/ip:port) or raw IPs/Hosts,
--- a/source/net/yacy/cora/protocol/http/HTTPClient.java
+++ b/source/net/yacy/cora/protocol/http/HTTPClient.java
@ -48,18 +48,10 @@ import javax.net.ssl.SSLContext;
 import javax.net.ssl.TrustManager;
 import javax.net.ssl.X509TrustManager;

-import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.MultiProtocolURL;
-import net.yacy.cora.protocol.ClientIdentification;
-import net.yacy.cora.protocol.ConnectionInfo;
-import net.yacy.cora.protocol.Domains;
-import net.yacy.cora.protocol.HeaderFramework;
-import net.yacy.cora.util.Memory;
-import net.yacy.kelondro.util.NamePrefixThreadFactory;
-
 import org.apache.http.Header;
 import org.apache.http.HttpEntity;
 import org.apache.http.HttpEntityEnclosingRequest;
+import org.apache.http.HttpHeaders;
 import org.apache.http.HttpHost;
 import org.apache.http.HttpResponse;
 import org.apache.http.auth.AuthScope;
@ -98,6 +90,16 @@ import org.apache.http.protocol.HttpContext;
 import org.apache.http.util.ByteArrayBuffer;
 import org.apache.http.util.EntityUtils;

+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.ConnectionInfo;
+import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.protocol.HeaderFramework;
+import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.Memory;
+import net.yacy.kelondro.util.NamePrefixThreadFactory;
+

 /**
 * HttpClient implementation which uses <a href="http://hc.apache.org/">HttpComponents Client</a>.
@ -560,6 +562,74 @@ public class HTTPClient {
    public int getStatusCode() {
 	    return this.httpResponse.getStatusLine().getStatusCode();
 	}
+    
+    /**
+     * Get Mime type from the response Content-Type header, without its parameters
+     * (anything from the first ';' on, such as the charset, is dropped).
+     * @return mime type (trimmed and lower cased) or null when there is no response or no Content-Type header value
+     */
+	public String getMimeType() {
+		if (this.httpResponse == null) {
+			return null;
+		}
+
+		final Header contentType = this.httpResponse.getFirstHeader(HttpHeaders.CONTENT_TYPE);
+		if (contentType == null) {
+			return null;
+		}
+
+		String mimeType = contentType.getValue();
+		if (mimeType == null) {
+			return null;
+		}
+
+		/* Cut the parameters off before trimming, so that "text/html ; charset=x" gives "text/html" */
+		final int pos = mimeType.indexOf(';');
+		if (pos >= 0) {
+			mimeType = mimeType.substring(0, pos);
+		}
+
+		/* Locale.ROOT keeps the result independent from the default locale (e.g. Turkish dotless i) */
+		return mimeType.trim().toLowerCase(java.util.Locale.ROOT);
+	}
+	
+	/**
+	 * Get character encoding from the {@code charset} parameter of the response
+	 * Content-Type header. The parameter name is matched case-insensitively and
+	 * one enclosing pair of single or double quotes is removed from the value.
+	 * When the parameter appears several times, the last non empty value is used.
+	 * 
+	 * @return the characters set name or null when not specified (or specified empty)
+	 */
+	public String getCharacterEncoding() {
+		if (this.httpResponse == null) {
+			return null;
+		}
+
+		final Header contentTypeHeader = this.httpResponse.getFirstHeader(HttpHeaders.CONTENT_TYPE);
+		if (contentTypeHeader == null) {
+			return null;
+		}
+
+		final String contentType = contentTypeHeader.getValue();
+		if (contentType == null) {
+			return null;
+		}
+
+		final String charsetPrefix = "charset=";
+		String charsetName = null;
+
+		/* parts[0] is the mime type itself : parameters start at index 1 */
+		final String[] parts = CommonPattern.SEMICOLON.split(contentType);
+		for (int i = 1; i < parts.length; i++) {
+			final String param = parts[i].trim();
+			/* Parameter names are case-insensitive : "Charset=UTF-8" must match as well */
+			if (!param.regionMatches(true, 0, charsetPrefix, 0, charsetPrefix.length())) {
+				continue;
+			}
+			String charset = param.substring(charsetPrefix.length()).trim();
+			if (charset.length() > 0 && (charset.charAt(0) == '\"' || charset.charAt(0) == '\'')) {
+				charset = charset.substring(1);
+			}
+			if (charset.endsWith("\"") || charset.endsWith("'")) {
+				charset = charset.substring(0, charset.length() - 1);
+			}
+			charset = charset.trim();
+			/* An empty value is treated as not specified, rather than returned as "" */
+			if (!charset.isEmpty()) {
+				charsetName = charset;
+			}
+		}
+
+		return charsetName;
+	}

    /**
     * This method gets direct access to the content-stream