diff --git a/htroot/CrawlResults.html b/htroot/CrawlResults.html index d544f0cb8..0b52228b3 100644 --- a/htroot/CrawlResults.html +++ b/htroot/CrawlResults.html @@ -75,7 +75,7 @@ ::

(7) Results from surrogates import

These records had been imported from surrogate files in DATA/SURROGATES/in

-

Use Case: place files with Dublin Core metadata content into DATA/SURROGATES/in or use an index import method (e.g. wikimedia import, OAI-PMH retrieval)

+

Use Case: place files with Dublin Core metadata content into DATA/SURROGATES/in or use an index import method (e.g. wikimedia import, OAI-PMH retrieval)

#(/process)# diff --git a/htroot/IndexImportOAIPMH_p.html b/htroot/IndexImportOAIPMH_p.html index e99da0dcb..5970ab592 100644 --- a/htroot/IndexImportOAIPMH_p.html +++ b/htroot/IndexImportOAIPMH_p.html @@ -9,11 +9,12 @@ #%env/templates/header.template%# #%env/templates/submenuContentIntegration.template%#

OAI-PMH Import

+

Results from the import can be monitored in the indexing results for surrogates

Single request import - This will submit only a single request as given here to an OAI-PMH server and imports records into the index + This will submit only a single request as given here to an OAI-PMH server and imports records into the index
@@ -36,7 +37,7 @@
Import all Records from a server - Import all records that follow according to resumption elements into index + Import all records that follow according to resumption elements into index
@@ -46,7 +47,7 @@
Thread:
#[thread]#
Source:
#[source]#
-
Processed Chunks:
#[chunkCount]# records
+
Processed Chunks:
#[chunkCount]#
Imported Records:
#[recordsCount]# records
Speed:
#[speed]# records per second
diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java index 136d4a76c..30fb61df4 100644 --- a/htroot/IndexImportOAIPMH_p.java +++ b/htroot/IndexImportOAIPMH_p.java @@ -73,6 +73,7 @@ public class IndexImportOAIPMH_p { if (post != null) { if (post.containsKey("urlstartone")) { String oaipmhurl = post.get("urlstartone"); + if (oaipmhurl.indexOf("?") < 0) oaipmhurl = oaipmhurl + "?verb=ListRecords&metadataPrefix=oai_dc"; DigestURI url = null; try { url = new DigestURI(oaipmhurl, null); diff --git a/source/net/yacy/document/importer/OAIPMHReader.java b/source/net/yacy/document/importer/OAIPMHReader.java index 2c28251dc..0b3b85678 100644 --- a/source/net/yacy/document/importer/OAIPMHReader.java +++ b/source/net/yacy/document/importer/OAIPMHReader.java @@ -59,7 +59,7 @@ public class OAIPMHReader { response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); byte[] b = response.getContent(); this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b)); - String file = filePrefix + "_" + this.source.getHost() + "_" + DateFormatter.formatShortMilliSecond(new Date()); + String file = filePrefix + "." + filename4source(source) + "." 
+ DateFormatter.formatShortMilliSecond(new Date()); File f0 = new File(targetDir, file + ".tmp"); File f1 = new File(targetDir, file + ".xml"); @@ -81,6 +81,15 @@ public class OAIPMHReader { */ } + public static final String filename4source(DigestURI source) { + String s = ResumptionToken.truncatedURL(source); + if (s.endsWith("?")) s = s.substring(0, s.length() - 1); + if (s.endsWith("/")) s = s.substring(0, s.length() - 1); + if (s.startsWith("https://")) s = s.substring(8); + if (s.startsWith("http://")) s = s.substring(7); + return s.replace('.', '_').replace('/', '_').replace(':', '_'); + } + public ResumptionToken getResumptionToken() { return this.resumptionToken; } diff --git a/source/net/yacy/document/importer/ResumptionToken.java b/source/net/yacy/document/importer/ResumptionToken.java index 0bd5ac670..198b5aba1 100644 --- a/source/net/yacy/document/importer/ResumptionToken.java +++ b/source/net/yacy/document/importer/ResumptionToken.java @@ -135,11 +135,9 @@ public class ResumptionToken extends TreeMap { if (expiration != null) { if (expiration.before(new Date())) throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(new Date())); // the resumption token is still fresh - return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null); } - - // may still be an encoded state - return new DigestURI(url + "verb=ListRecords&" + token, null); + + return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null); } /**