The OAI ListFriends source list is now configurable: just add the sources to defaults/oaiListFriendsSource.xml

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6857 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent cf13c65bdd
commit a7d038bb7a

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>Sources for OAI ListFriends entries. The Key is the URL to the ListFriends Source, the Value is the Path to the local file where the ListFriends file is stored</comment>
<entry key="http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?_action_export=1&amp;output=ROAR%3A%3AListFriends">DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml</entry>
<entry key="http://www.openarchives.org/Register/ListFriends">DATA/DICTIONARIES/harvesting/ListFriends.xml</entry>
</properties>

@ -44,7 +44,7 @@ public class IndexImportOAIPMHList_p {
prop.put("source", 0);
if (post != null && post.containsKey("source")) {
Set<String> oaiRoots = OAIListFriendsLoader.load(sb.loader).keySet();
Set<String> oaiRoots = OAIListFriendsLoader.getListFriends(sb.loader).keySet();
boolean dark = false;
int count = 0;

@ -486,7 +486,8 @@ public final class Switchboard extends serverSwitch {
// start a loader
log.logConfig("Starting Crawl Loader");
this.loader = new LoaderDispatcher(this);
OAIListFriendsLoader.init(this.loader);
Map<String, File> oaiFriends = OAIListFriendsLoader.loadListFriendsSources(new File("defaults/oaiListFriendsSource.xml"));
OAIListFriendsLoader.init(this.loader, oaiFriends);
this.crawlQueues = new CrawlQueues(this, queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),

@ -24,10 +24,14 @@ package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import java.util.Map.Entry;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
@ -48,63 +52,50 @@ import de.anomic.crawler.retrieval.Response;
public class OAIListFriendsLoader {
private static final long serialVersionUID = -8705115274655024604L;
private static final HashMap<String, File> listFriends = new HashMap<String, File>();
//private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?screen=ROAR%3A%3AFacetSearch&_action_export=1&output=ROAR%3A%3AListFriends&exp=1|1|-recordcount%2F-date|archive|-|-|eprint_status%3Aeprint_status%3AALL%3AEQ%3Aarchive|metadata_visibility%3Ametadata_visibility%3AALL%3AEX%3Ashow";
private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?_action_export=1&output=ROAR%3A%3AListFriends";
private static File cache10 = new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml");
private static String url20 = "http://www.openarchives.org/Register/ListFriends";
private static File cache20 = new File("DATA/DICTIONARIES/harvesting/ListFriends.xml");
public static void init(LoaderDispatcher loader) {
loader.loadIfNotExistBackground(url10, cache10);
loader.loadIfNotExistBackground(url20, cache20);
/**
 * Registers the given ListFriends sources and hands every registered source to the loader.
 * All entries of moreFriends are added to the static listFriends map; afterwards, if a
 * loader is given, loadIfNotExistBackground is called once for each registered entry.
 * @param loader the LoaderDispatcher to call; if null, the sources are only registered
 * @param moreFriends the sources to register, key is the source URL and value the local file
 */
public static void init(LoaderDispatcher loader, Map<String, File> moreFriends) {
    listFriends.putAll(moreFriends);
    if (loader == null) {
        // without a loader there is nothing more to do than the registration above
        return;
    }
    for (Map.Entry<String, File> source : listFriends.entrySet()) {
        final String sourceUrl = source.getKey();
        final File cacheFile = source.getValue();
        loader.loadIfNotExistBackground(sourceUrl, cacheFile);
    }
}
public static Map<String, String> load(LoaderDispatcher loader) {
Map<String, String> map10;
public static Map<String, File> loadListFriendsSources(File initFile) {
Properties p = new Properties();
Map<String, File> m = new HashMap<String, File>();
try {
map10 = load(null, null, new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml"));
p.loadFromXML(new FileInputStream(initFile));
} catch (IOException e) {
map10 = new TreeMap<String, String>();
Log.logException(e);
return m;
}
Map<String, String> map20;
try {
map20 = load(null, null, new File("DATA/DICTIONARIES/harvesting/ListFriends.xml"));
} catch (IOException e) {
map20 = new TreeMap<String, String>();
}
map10.putAll(map20);
return map10;
for (Entry<Object, Object> e: p.entrySet()) m.put((String) e.getKey(), new File((String) e.getValue()));
return m;
}
/**
 * Load an OAI ListFriends file from a cache location or from the net.
 * If the given file exists, it is read and parsed. Otherwise the content is loaded
 * from the source URL using the given loader, stored in the target file and then parsed.
 * The resulting map is a mapping from OAI-PMH start url to a location description.
 * @param loader a LoaderDispatcher that loads the file if targetFile does not exist; may be null if targetFile exists
 * @param source the source URL for the OAI ListFriends file; may be null if targetFile exists
 * @param targetFile the file where the loaded content is stored if it does not exist, the source of the content otherwise
 * @return a Map from OAI-PMH source to source description (which is usually also a URL)
 * @throws IOException if the file cannot be read, or if it does not exist and cannot be loaded
 *         (no loader, no source URL, or the loader returned no response)
 */
private static Map<String, String> load(LoaderDispatcher loader, DigestURI source, File targetFile) throws IOException {
    byte[] b;
    if (targetFile.exists()) {
        // load file
        b = FileUtils.read(targetFile);
    } else {
        // load from the net
        // callers in this class pass null for loader and source when they only want the cached file;
        // report the missing file as an IOException (which they catch) instead of a NullPointerException
        if (loader == null || source == null) {
            throw new IOException("ListFriends file " + targetFile + " does not exist and cannot be loaded: no loader or source url given");
        }
        Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
        if (response == null) {
            throw new IOException("no response when loading ListFriends from " + source);
        }
        b = response.getContent();
        FileUtils.copy(b, targetFile);
    }
    return new Parser(b).map;
}
/**
 * Collects the entries of all registered OAI ListFriends sources into one map.
 * For every entry of the static listFriends map: if the local file does not exist and a
 * loader is given, the content is loaded from the source URL and written to the file.
 * If the file exists afterwards, it is read and parsed and its entries are added to the result.
 * A failure for one source is logged and does not prevent the other sources from being processed.
 * @param loader the LoaderDispatcher used to load missing files; if null, only existing files are read
 * @return a sorted map with the merged entries of all ListFriends files that could be read; never null
 */
public static Map<String, String> getListFriends(LoaderDispatcher loader) {
    Map<String, String> map = new TreeMap<String, String>();
    for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
        File cacheFile = oaiFriend.getValue();
        try {
            if (!cacheFile.exists()) {
                // try to fetch the missing file; without a loader this step is skipped
                Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
                if (response != null) FileUtils.copy(response.getContent(), cacheFile);
            }
            if (cacheFile.exists()) {
                byte[] b = FileUtils.read(cacheFile);
                if (b != null) {
                    Map<String, String> m = new Parser(b).map;
                    if (m != null) map.putAll(m);
                }
            }
        } catch (IOException e) {
            // best effort: keep going with the remaining sources, but do not hide the failure
            Log.logException(e);
        }
    }
    return map;
}
// get a resumption token using a SAX xml parser from an input stream
private static class Parser extends DefaultHandler {
@ -181,23 +172,5 @@ public class OAIListFriendsLoader {
}
}
/**
 * Manual check from the command line: parses the two ListFriends files below, prints the
 * size of each map and of the merged map, then prints every key of the merged map.
 * An IOException raised while loading is printed as a stack trace.
 * @param args not used
 */
public static void main(String[] args) {
    final File roarFile = new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml");
    final File registerFile = new File("DATA/DICTIONARIES/harvesting/ListFriends.xml");
    try {
        final Map<String, String> merged = load(null, null, roarFile);
        final int roarCount = merged.size();
        final Map<String, String> register = load(null, null, registerFile);
        final int registerCount = register.size();
        // the first map doubles as the merge target
        merged.putAll(register);
        System.out.println("count1 = " + roarCount + ", count2 = " + registerCount + ", all = " + merged.size());
        for (final String key : merged.keySet()) {
            System.out.println(key);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
}

@ -174,7 +174,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
File surrogatesIn,
File surrogatesOut,
long staleLimit) {
Set<String> plainList = OAIListFriendsLoader.load(loader).keySet();
Set<String> plainList = OAIListFriendsLoader.getListFriends(loader).keySet();
Map<String, Date> loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut);
long limit = System.currentTimeMillis() - staleLimit;
for (Map.Entry<String, Date> a: loaded.entrySet()) {

Loading…
Cancel
Save