git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7426 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
f5baf53391
commit
54e77e6255
@ -1,313 +0,0 @@
|
||||
/**
|
||||
* Search
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.05.2010 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.cora.protocol.http.HTTPConnector;
|
||||
import net.yacy.cora.protocol.http.LinkExtractor;
|
||||
import net.yacy.cora.storage.ScoreMap;
|
||||
|
||||
import org.apache.http.entity.mime.content.ContentBody;
|
||||
import org.apache.http.entity.mime.content.StringBody;
|
||||
|
||||
public class Search extends Thread {
|
||||
|
||||
private final static int recordsPerSession = 10;
|
||||
|
||||
public static final String[] SRURSSServicesList = {
|
||||
"http://yacy.dyndns.org:8000/yacysearch.rss",
|
||||
"http://yacy.caloulinux.net:8085/yacysearch.rss",
|
||||
"http://algire.dyndns.org:8085/yacysearch.rss",
|
||||
"http://breyvogel.dyndns.org:8002/yacysearch.rss"
|
||||
};
|
||||
|
||||
public static final String[] genericServicesList = {
|
||||
"http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=$&n=2",
|
||||
"http://blekko.com/ws/$+/rss",
|
||||
"http://www.bing.com/search?q=$&format=rss",
|
||||
"http://search.twitter.com/search.atom?q=$"
|
||||
};
|
||||
|
||||
public static Thread accumulateSRURSS(
|
||||
final String urlBase,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global,
|
||||
final Map<MultiProtocolURI, List<Integer>> result) {
|
||||
Thread t = new Thread() {
|
||||
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
|
||||
public void run() {
|
||||
searchSRURSS(urlBase, query, timeoutInit, maximumRecordsInit, verify, global, results);
|
||||
int p = 1;
|
||||
RSSMessage message;
|
||||
try {
|
||||
while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
|
||||
MultiProtocolURI uri;
|
||||
if (message == null) break;
|
||||
try {
|
||||
uri = new MultiProtocolURI(message.getLink());
|
||||
List<Integer> m = result.get(uri);
|
||||
if (m == null) m = new ArrayList<Integer>();
|
||||
m.add(new Integer(p++));
|
||||
result.put(uri, m);
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
return t;
|
||||
}
|
||||
|
||||
public static Thread searchSRURSS(
|
||||
final String urlBase,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global,
|
||||
final BlockingQueue<RSSMessage> queue) {
|
||||
Thread job = new Thread() {
|
||||
public void run() {
|
||||
int startRecord = 0;
|
||||
RSSMessage message;
|
||||
int maximumRecords = maximumRecordsInit;
|
||||
long timeout = timeoutInit;
|
||||
mainloop: while (timeout > 0 && maximumRecords > 0) {
|
||||
long st = System.currentTimeMillis();
|
||||
RSSFeed feed;
|
||||
try {
|
||||
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global);
|
||||
} catch (IOException e1) {
|
||||
break mainloop;
|
||||
}
|
||||
if (feed == null || feed.isEmpty()) break mainloop;
|
||||
maximumRecords -= feed.size();
|
||||
innerloop: while (!feed.isEmpty()) {
|
||||
message = feed.pollMessage();
|
||||
if (message == null) break innerloop;
|
||||
try {
|
||||
queue.put(message);
|
||||
} catch (InterruptedException e) {
|
||||
break innerloop;
|
||||
}
|
||||
}
|
||||
startRecord += recordsPerSession;
|
||||
timeout -= System.currentTimeMillis() - st;
|
||||
}
|
||||
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
|
||||
* send a query to a yacy public search interface
|
||||
* @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
|
||||
* @param query the query as string
|
||||
* @param startRecord number of first record
|
||||
* @param maximumRecords maximum number of records
|
||||
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
|
||||
* @param global if true also search results from other peers are included
|
||||
* @param timeout milliseconds that are waited at maximum for a search result
|
||||
* @return
|
||||
*/
|
||||
public static RSSFeed loadSRURSS(
|
||||
String rssSearchServiceURL,
|
||||
String query,
|
||||
long timeout,
|
||||
int startRecord,
|
||||
int maximumRecords,
|
||||
boolean verify,
|
||||
boolean global) throws IOException {
|
||||
MultiProtocolURI uri = null;
|
||||
try {
|
||||
uri = new MultiProtocolURI(rssSearchServiceURL);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
|
||||
}
|
||||
|
||||
// send request
|
||||
try {
|
||||
final LinkedHashMap<String,ContentBody> parts = new LinkedHashMap<String,ContentBody>();
|
||||
parts.put("query", new StringBody(query));
|
||||
parts.put("startRecord", new StringBody(Integer.toString(startRecord)));
|
||||
parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords)));
|
||||
parts.put("verify", new StringBody(verify ? "true" : "false"));
|
||||
parts.put("resource", new StringBody(global ? "global" : "local"));
|
||||
final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
|
||||
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
|
||||
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
|
||||
if (reader == null) {
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
|
||||
}
|
||||
final RSSFeed feed = reader.getFeed();
|
||||
if (feed == null) {
|
||||
// case where the rss reader does not understand the content
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
|
||||
}
|
||||
return feed;
|
||||
} catch (final IOException e) {
|
||||
throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
public static Thread accumulateGeneric(
|
||||
String query,
|
||||
String service,
|
||||
final Map<MultiProtocolURI, List<Integer>> result,
|
||||
final int timeout) {
|
||||
query = query.replace(' ', '+');
|
||||
final String servicePatched = service.replaceAll("\\$", query);
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
try {
|
||||
MultiProtocolURI[] sr = loadGeneric(new MultiProtocolURI(servicePatched), timeout);
|
||||
int p = 1;
|
||||
for (MultiProtocolURI u: sr) {
|
||||
List<Integer> m = result.get(u);
|
||||
if (m == null) m = new ArrayList<Integer>();
|
||||
m.add(new Integer(p++));
|
||||
result.put(u, m);
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
return t;
|
||||
}
|
||||
|
||||
private static MultiProtocolURI[] loadGeneric(MultiProtocolURI uri, long timeout) throws IOException {
|
||||
final RequestHeader requestHeader = new RequestHeader();
|
||||
requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
|
||||
final HTTPClient client = new HTTPClient();
|
||||
client.setTimout((int) timeout);
|
||||
client.setHeader(requestHeader.entrySet());
|
||||
byte[] result = client.GETbytes(uri.toString());
|
||||
client.finish();
|
||||
if (client.getStatusCode() != 200) {
|
||||
throw new IOException("Server returned status: " + client.getHttpResponse().getStatusLine());
|
||||
}
|
||||
if (result == null) throw new IOException("cora.Search error asking peer '" + uri.getHost() + "': null");
|
||||
LinkExtractor le = new LinkExtractor(Pattern.compile(".*" + uri.getHost() + ".*"));
|
||||
le.scrape(new String(result));
|
||||
MultiProtocolURI[] links = le.getLinks();
|
||||
return links;
|
||||
}
|
||||
|
||||
public static RSSFeed links2feed(Set<MultiProtocolURI> links, String source) {
|
||||
RSSFeed feed = new RSSFeed(Integer.MAX_VALUE);
|
||||
String u;
|
||||
RSSMessage message;
|
||||
for (MultiProtocolURI uri: links) {
|
||||
u = uri.toNormalform(true, false);
|
||||
message = new RSSMessage(u, "", u);
|
||||
message.setAuthor(source);
|
||||
feed.addMessage(message);
|
||||
}
|
||||
return feed;
|
||||
}
|
||||
|
||||
private Map<MultiProtocolURI, List<Integer>> result;
|
||||
private String query;
|
||||
private int count;
|
||||
private String[] yacyServices, rssServices, genericServices;
|
||||
private List<Thread> threads;
|
||||
|
||||
public Search(String query, int count, String[] rssServices, String[] genericServices) {
|
||||
this.result = new ConcurrentHashMap<MultiProtocolURI, List<Integer>>();
|
||||
this.query = query;
|
||||
this.count = count;
|
||||
this.yacyServices = yacyServices;
|
||||
this.rssServices = rssServices;
|
||||
this.genericServices = genericServices;
|
||||
this.threads = new ArrayList<Thread>();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
for (String service: this.rssServices) threads.add(accumulateSRURSS(service, this.query, 10000, this.count, false, true, this.result));
|
||||
for (String service: this.genericServices) threads.add(accumulateGeneric(this.query, service, this.result, 10000));
|
||||
}
|
||||
|
||||
public ScoreMap<MultiProtocolURI> getResults() {
|
||||
ScoreMap<MultiProtocolURI> scores = new ScoreMap<MultiProtocolURI>();
|
||||
int m = this.rssServices.length + this.genericServices.length;
|
||||
for (Map.Entry<MultiProtocolURI, List<Integer>> entry: this.result.entrySet()) {
|
||||
int a = 0;
|
||||
for (Integer i : entry.getValue()) a += i.intValue();
|
||||
scores.inc(entry.getKey(), a * m / entry.getValue().size());
|
||||
}
|
||||
return scores;
|
||||
}
|
||||
|
||||
public void waitTermination() {
|
||||
for (Thread t: threads) try {t.join();} catch (InterruptedException e) {}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String s: args) sb.append(s).append(' ');
|
||||
String query = sb.toString().trim();
|
||||
Search search = new Search(query, 100, SRURSSServicesList, genericServicesList);
|
||||
search.start();
|
||||
try {Thread.sleep(100);} catch (InterruptedException e1) {}
|
||||
search.waitTermination();
|
||||
ScoreMap<MultiProtocolURI> result = search.getResults();
|
||||
Iterator<MultiProtocolURI> i = result.keys(true);
|
||||
MultiProtocolURI u;
|
||||
while (i.hasNext()) {
|
||||
u = i.next();
|
||||
System.out.println("[" + result.get(u) + "] " + u.toNormalform(true, false));
|
||||
}
|
||||
try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
/**
|
||||
* Accumulator
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 07.01.2011 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
/**
|
||||
* place-holder class to provide a object declaration for threads in Search object
|
||||
*/
|
||||
public interface SearchAccumulator extends Runnable {
|
||||
|
||||
/**
|
||||
* join this accumulator: wait until it terminates
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
public void join() throws InterruptedException;
|
||||
|
||||
/**
|
||||
* test if the accumulator is still running
|
||||
* @return
|
||||
*/
|
||||
public boolean isAlive();
|
||||
|
||||
}
|
@ -0,0 +1,164 @@
|
||||
/**
|
||||
* Search
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.05.2010 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General private
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General private License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General private License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.cora.storage.ScoreMap;
|
||||
|
||||
public class SearchHub {
|
||||
|
||||
private static final String[] SRURSSServicesList = {
|
||||
"http://yacy.dyndns.org:8000/yacysearch.rss",
|
||||
"http://yacy.caloulinux.net:8085/yacysearch.rss",
|
||||
"http://algire.dyndns.org:8085/yacysearch.rss",
|
||||
"http://breyvogel.dyndns.org:8002/yacysearch.rss"
|
||||
};
|
||||
|
||||
public final static SearchHub EMPTY = new SearchHub("", 0);
|
||||
|
||||
private String query;
|
||||
private int timeout;
|
||||
private List<SearchAccumulator> threads;
|
||||
private Map<RSSMessage, List<Integer>> result;
|
||||
|
||||
public SearchHub(final String query, final int timeout) {
|
||||
this.query = query;
|
||||
this.timeout = timeout;
|
||||
this.threads = new ArrayList<SearchAccumulator>();
|
||||
this.result = new ConcurrentHashMap<RSSMessage, List<Integer>>();
|
||||
}
|
||||
|
||||
/**
|
||||
* get the result of the accumulation
|
||||
* @return
|
||||
*/
|
||||
public Map<RSSMessage, List<Integer>> getAccumulation() {
|
||||
return this.result;
|
||||
}
|
||||
|
||||
/**
|
||||
* add an accumulator to the list of accumulation theads.
|
||||
* this is mainly used for awaitTermination() and isTerminated()
|
||||
* @param a
|
||||
*/
|
||||
public void addAccumulator(SearchAccumulator a) {
|
||||
this.threads.add(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the original query string
|
||||
* @return
|
||||
*/
|
||||
public String getQuery() {
|
||||
return this.query;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the given time-out of the search request
|
||||
* @return
|
||||
*/
|
||||
public int getTimeout() {
|
||||
return this.timeout;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the list of search results as scored map.
|
||||
* The results are combined using their appearance positions.
|
||||
* Every time this method is called the list is re-computed to reflect the latest results
|
||||
* @return a score map of urls
|
||||
*/
|
||||
public ScoreMap<String> getResults() {
|
||||
ScoreMap<String> scores = new ScoreMap<String>();
|
||||
int m = threads.size();
|
||||
for (Map.Entry<RSSMessage, List<Integer>> entry: this.result.entrySet()) {
|
||||
int a = 0;
|
||||
for (Integer i : entry.getValue()) a += i.intValue();
|
||||
scores.inc(entry.getKey().getLink(), a * m / entry.getValue().size());
|
||||
}
|
||||
return scores;
|
||||
}
|
||||
|
||||
/**
|
||||
* wait until all accumulation threads have terminated
|
||||
*/
|
||||
public void waitTermination() {
|
||||
for (SearchAccumulator t: threads) try {t.join();} catch (InterruptedException e) {}
|
||||
}
|
||||
|
||||
/**
|
||||
* return true if all accumulation threads have terminated
|
||||
* @return
|
||||
*/
|
||||
public boolean isTerminated() {
|
||||
for (SearchAccumulator t: threads) if (t.isAlive()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* return a hash code of the search hub.
|
||||
* This is computed using only the query string because that identifies the object
|
||||
*/
|
||||
public int hashCode() {
|
||||
return query.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* test method to add a list of SRU RSS services.
|
||||
* such services are provided by YaCy peers
|
||||
* @param search
|
||||
* @param rssServices
|
||||
* @param count
|
||||
* @param verify
|
||||
* @param global
|
||||
*/
|
||||
public static void addSRURSSServices(SearchHub search, String[] rssServices, int count, boolean verify, boolean global) {
|
||||
for (String service: rssServices) {
|
||||
SearchSRURSS accumulator = new SearchSRURSS(search, service, count, verify, global);
|
||||
accumulator.start();
|
||||
search.addAccumulator(accumulator);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String s: args) sb.append(s).append(' ');
|
||||
String query = sb.toString().trim();
|
||||
SearchHub search = new SearchHub(query, 10000);
|
||||
addSRURSSServices(search, SRURSSServicesList, 100, false, false);
|
||||
try {Thread.sleep(100);} catch (InterruptedException e1) {}
|
||||
search.waitTermination();
|
||||
ScoreMap<String> result = search.getResults();
|
||||
Iterator<String> i = result.keys(true);
|
||||
String u;
|
||||
while (i.hasNext()) {
|
||||
u = i.next();
|
||||
System.out.println("[" + result.get(u) + "] " + u);
|
||||
}
|
||||
try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
@ -0,0 +1,201 @@
|
||||
/**
|
||||
* AccumulateSRURSS
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 06.01.2011 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.http.entity.mime.content.ContentBody;
|
||||
import org.apache.http.entity.mime.content.StringBody;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.http.HTTPConnector;
|
||||
|
||||
public class SearchSRURSS extends Thread implements SearchAccumulator {
|
||||
|
||||
private final static int recordsPerSession = 10;
|
||||
|
||||
final String urlBase;
|
||||
final String query;
|
||||
final long timeoutInit;
|
||||
final int maximumRecordsInit;
|
||||
final boolean verify;
|
||||
final boolean global;
|
||||
final Map<RSSMessage, List<Integer>> result;
|
||||
|
||||
private final BlockingQueue<RSSMessage> results;
|
||||
|
||||
public SearchSRURSS(
|
||||
final Map<RSSMessage, List<Integer>> result,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final String urlBase,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global) {
|
||||
this.results = new LinkedBlockingQueue<RSSMessage>();
|
||||
this.result = result;
|
||||
this.query = query;
|
||||
this.timeoutInit = timeoutInit;
|
||||
this.urlBase = urlBase;
|
||||
this.maximumRecordsInit = maximumRecordsInit;
|
||||
this.verify = verify;
|
||||
this.global = global;
|
||||
}
|
||||
|
||||
public SearchSRURSS(
|
||||
final SearchHub search,
|
||||
final String urlBase,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global) {
|
||||
this.results = new LinkedBlockingQueue<RSSMessage>();
|
||||
this.result = search.getAccumulation();
|
||||
this.query = search.getQuery();
|
||||
this.timeoutInit = search.getTimeout();
|
||||
this.urlBase = urlBase;
|
||||
this.maximumRecordsInit = maximumRecordsInit;
|
||||
this.verify = verify;
|
||||
this.global = global;
|
||||
}
|
||||
|
||||
public void run() {
|
||||
searchSRURSS(results, urlBase, query, timeoutInit, maximumRecordsInit, verify, global);
|
||||
int p = 1;
|
||||
RSSMessage message;
|
||||
try {
|
||||
while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
|
||||
if (message == null) break;
|
||||
List<Integer> m = result.get(message.getLink());
|
||||
if (m == null) m = new ArrayList<Integer>();
|
||||
m.add(new Integer(p++));
|
||||
result.put(message, m);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static Thread searchSRURSS(
|
||||
final BlockingQueue<RSSMessage> queue,
|
||||
final String urlBase,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global) {
|
||||
Thread job = new Thread() {
|
||||
public void run() {
|
||||
int startRecord = 0;
|
||||
RSSMessage message;
|
||||
int maximumRecords = maximumRecordsInit;
|
||||
long timeout = timeoutInit;
|
||||
mainloop: while (timeout > 0 && maximumRecords > 0) {
|
||||
long st = System.currentTimeMillis();
|
||||
RSSFeed feed;
|
||||
try {
|
||||
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global);
|
||||
} catch (IOException e1) {
|
||||
break mainloop;
|
||||
}
|
||||
if (feed == null || feed.isEmpty()) break mainloop;
|
||||
maximumRecords -= feed.size();
|
||||
innerloop: while (!feed.isEmpty()) {
|
||||
message = feed.pollMessage();
|
||||
if (message == null) break innerloop;
|
||||
try {
|
||||
queue.put(message);
|
||||
} catch (InterruptedException e) {
|
||||
break innerloop;
|
||||
}
|
||||
}
|
||||
startRecord += recordsPerSession;
|
||||
timeout -= System.currentTimeMillis() - st;
|
||||
}
|
||||
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
|
||||
* send a query to a yacy public search interface
|
||||
* @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
|
||||
* @param query the query as string
|
||||
* @param startRecord number of first record
|
||||
* @param maximumRecords maximum number of records
|
||||
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
|
||||
* @param global if true also search results from other peers are included
|
||||
* @param timeout milliseconds that are waited at maximum for a search result
|
||||
* @return
|
||||
*/
|
||||
public static RSSFeed loadSRURSS(
|
||||
String rssSearchServiceURL,
|
||||
String query,
|
||||
long timeout,
|
||||
int startRecord,
|
||||
int maximumRecords,
|
||||
boolean verify,
|
||||
boolean global) throws IOException {
|
||||
MultiProtocolURI uri = null;
|
||||
try {
|
||||
uri = new MultiProtocolURI(rssSearchServiceURL);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
|
||||
}
|
||||
|
||||
// send request
|
||||
try {
|
||||
final LinkedHashMap<String,ContentBody> parts = new LinkedHashMap<String,ContentBody>();
|
||||
parts.put("query", new StringBody(query));
|
||||
parts.put("startRecord", new StringBody(Integer.toString(startRecord)));
|
||||
parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords)));
|
||||
parts.put("verify", new StringBody(verify ? "true" : "false"));
|
||||
parts.put("resource", new StringBody(global ? "global" : "local"));
|
||||
final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
|
||||
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
|
||||
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
|
||||
if (reader == null) {
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
|
||||
}
|
||||
final RSSFeed feed = reader.getFeed();
|
||||
if (feed == null) {
|
||||
// case where the rss reader does not understand the content
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
|
||||
}
|
||||
return feed;
|
||||
} catch (final IOException e) {
|
||||
throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue