# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
The classes that actually handle the downloads.
"""
import os
import sys
import ftplib
import warnings
import requests
from .utils import parse_url
try:
from tqdm import tqdm
except ImportError:
tqdm = None
try:
import paramiko
except ImportError:
paramiko = None
def choose_downloader(url, progressbar=False):
"""
Choose the appropriate downloader for the given URL based on the protocol.
Parameters
----------
url : str
A URL (including protocol).
progressbar : bool or an arbitrary progress bar object
If True, will print a progress bar of the download to standard error
(stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
installed. Alternatively, an arbitrary progress bar object can be
passed. See :ref:`custom-progressbar` for details.
Returns
-------
downloader
A downloader class, like :class:`pooch.HTTPDownloader`,
:class:`pooch.FTPDownloader`, or :class: `pooch.SFTPDownloader`.
Examples
--------
>>> downloader = choose_downloader("http://something.com")
>>> print(downloader.__class__.__name__)
HTTPDownloader
>>> downloader = choose_downloader("https://something.com")
>>> print(downloader.__class__.__name__)
HTTPDownloader
>>> downloader = choose_downloader("ftp://something.com")
>>> print(downloader.__class__.__name__)
FTPDownloader
>>> downloader = choose_downloader("doi:DOI/filename.csv")
>>> print(downloader.__class__.__name__)
DOIDownloader
"""
known_downloaders = {
"ftp": FTPDownloader,
"https": HTTPDownloader,
"http": HTTPDownloader,
"sftp": SFTPDownloader,
"doi": DOIDownloader,
}
parsed_url = parse_url(url)
if parsed_url["protocol"] not in known_downloaders:
raise ValueError(
f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. "
f"Must be one of {known_downloaders.keys()}."
)
downloader = known_downloaders[parsed_url["protocol"]](progressbar=progressbar)
return downloader
[docs]class HTTPDownloader: # pylint: disable=too-few-public-methods
"""
Download manager for fetching files over HTTP/HTTPS.
When called, downloads the given file URL into the specified local file.
Uses the :mod:`requests` library to manage downloads.
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
the download of files (for example, to use authentication or print a
progress bar).
Parameters
----------
progressbar : bool or an arbitrary progress bar object
If True, will print a progress bar of the download to standard error
(stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
installed. Alternatively, an arbitrary progress bar object can be
passed. See :ref:`custom-progressbar` for details.
chunk_size : int
Files are streamed *chunk_size* bytes at a time instead of loading
everything into memory at one. Usually doesn't need to be changed.
**kwargs
All keyword arguments given when creating an instance of this class
will be passed to :func:`requests.get`.
Examples
--------
Download one of the data files from the Pooch repository:
>>> import os
>>> from pooch import __version__, check_version
>>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
>>> url = url.format(check_version(__version__, fallback="main"))
>>> downloader = HTTPDownloader()
>>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
>>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
>>> os.path.exists("tiny-data.txt")
True
>>> with open("tiny-data.txt") as f:
... print(f.read().strip())
# A tiny data file for test purposes only
1 2 3 4 5 6
>>> os.remove("tiny-data.txt")
Authentication can be handled by passing a user name and password to
:func:`requests.get`. All arguments provided when creating an instance of
the class are forwarded to :func:`requests.get`. We'll use
``auth=(username, password)`` to use basic HTTPS authentication. The
https://httpbin.org website allows us to make a fake a login request using
whatever username and password we provide to it:
>>> user = "doggo"
>>> password = "goodboy"
>>> # httpbin will ask for the user and password we provide in the URL
>>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
>>> # Trying without the login credentials causes an error
>>> downloader = HTTPDownloader()
>>> try:
... downloader(url=url, output_file="tiny-data.txt", pooch=None)
... except Exception:
... print("There was an error!")
There was an error!
>>> # Pass in the credentials to HTTPDownloader
>>> downloader = HTTPDownloader(auth=(user, password))
>>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
>>> with open("tiny-data.txt") as f:
... for line in f:
... print(line.rstrip())
{
"authenticated": true,
"user": "doggo"
}
>>> os.remove("tiny-data.txt")
"""
def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
self.kwargs = kwargs
self.progressbar = progressbar
self.chunk_size = chunk_size
if self.progressbar is True and tqdm is None:
raise ValueError("Missing package 'tqdm' required for progress bars.")
[docs] def __call__(self, url, output_file, pooch, check_only=False):
"""
Download the given URL over HTTP to the given output file.
Uses :func:`requests.get`.
Parameters
----------
url : str
The URL to the file you want to download.
output_file : str or file-like object
Path (and file name) to which the file will be downloaded.
pooch : :class:`~pooch.Pooch`
The instance of :class:`~pooch.Pooch` that is calling this method.
check_only : bool
If True, will only check if a file exists on the server and
**without downloading the file**. Will return ``True`` if the file
exists and ``False`` otherwise.
Returns
-------
availability : bool or None
If ``check_only==True``, returns a boolean indicating if the file
is available on the server. Otherwise, returns ``None``.
"""
if check_only:
response = requests.head(url, allow_redirects=True)
available = bool(response.status_code == 200)
return available
kwargs = self.kwargs.copy()
kwargs.setdefault("stream", True)
ispath = not hasattr(output_file, "write")
if ispath:
output_file = open(output_file, "w+b")
try:
response = requests.get(url, **kwargs)
response.raise_for_status()
content = response.iter_content(chunk_size=self.chunk_size)
total = int(response.headers.get("content-length", 0))
if self.progressbar is True:
# Need to use ascii characters on Windows because there isn't
# always full unicode support
# (see https://github.com/tqdm/tqdm/issues/454)
use_ascii = bool(sys.platform == "win32")
progress = tqdm(
total=total,
ncols=79,
ascii=use_ascii,
unit="B",
unit_scale=True,
leave=True,
)
elif self.progressbar:
progress = self.progressbar
progress.total = total
for chunk in content:
if chunk:
output_file.write(chunk)
output_file.flush()
if self.progressbar:
# Use the chunk size here because chunk may be much
# larger if the data are decompressed by requests after
# reading (happens with text files).
progress.update(self.chunk_size)
# Make sure the progress bar gets filled even if the actual number
# is chunks is smaller than expected. This happens when streaming
# text files that are compressed by the server when sending (gzip).
# Binary files don't experience this.
if self.progressbar:
progress.reset()
progress.update(total)
progress.close()
finally:
if ispath:
output_file.close()
return None
[docs]class FTPDownloader: # pylint: disable=too-few-public-methods
"""
Download manager for fetching files over FTP.
When called, downloads the given file URL into the specified local file.
Uses the :mod:`ftplib` module to manage downloads.
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
the download of files (for example, to use authentication or print a
progress bar).
Parameters
----------
port : int
Port used for the FTP connection.
username : str
User name used to login to the server. Only needed if the server
requires authentication (i.e., no anonymous FTP).
password : str
Password used to login to the server. Only needed if the server
requires authentication (i.e., no anonymous FTP). Use the empty string
to indicate no password is required.
account : str
Some servers also require an "account" name for authentication.
timeout : int
Timeout in seconds for ftp socket operations, use None to mean no
timeout.
progressbar : bool
If True, will print a progress bar of the download to standard error
(stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
installed. **Custom progress bars are not yet supported.**
chunk_size : int
Files are streamed *chunk_size* bytes at a time instead of loading
everything into memory at one. Usually doesn't need to be changed.
"""
def __init__(
self,
port=21,
username="anonymous",
password="",
account="",
timeout=None,
progressbar=False,
chunk_size=1024,
):
self.port = port
self.username = username
self.password = password
self.account = account
self.timeout = timeout
self.progressbar = progressbar
self.chunk_size = chunk_size
if self.progressbar is True and tqdm is None:
raise ValueError("Missing package 'tqdm' required for progress bars.")
[docs] def __call__(self, url, output_file, pooch, check_only=False):
"""
Download the given URL over FTP to the given output file.
Parameters
----------
url : str
The URL to the file you want to download.
output_file : str or file-like object
Path (and file name) to which the file will be downloaded.
pooch : :class:`~pooch.Pooch`
The instance of :class:`~pooch.Pooch` that is calling this method.
check_only : bool
If True, will only check if a file exists on the server and
**without downloading the file**. Will return ``True`` if the file
exists and ``False`` otherwise.
Returns
-------
availability : bool or None
If ``check_only==True``, returns a boolean indicating if the file
is available on the server. Otherwise, returns ``None``.
"""
parsed_url = parse_url(url)
ftp = ftplib.FTP(timeout=self.timeout)
ftp.connect(host=parsed_url["netloc"], port=self.port)
if check_only:
directory, file_name = os.path.split(parsed_url["path"])
try:
ftp.login(user=self.username, passwd=self.password, acct=self.account)
available = file_name in ftp.nlst(directory)
finally:
ftp.close()
return available
ispath = not hasattr(output_file, "write")
if ispath:
output_file = open(output_file, "w+b")
try:
ftp.login(user=self.username, passwd=self.password, acct=self.account)
command = f"RETR {parsed_url['path']}"
if self.progressbar:
# Make sure the file is set to binary mode, otherwise we can't
# get the file size. See: https://stackoverflow.com/a/22093848
ftp.voidcmd("TYPE I")
use_ascii = bool(sys.platform == "win32")
progress = tqdm(
total=int(ftp.size(parsed_url["path"])),
ncols=79,
ascii=use_ascii,
unit="B",
unit_scale=True,
leave=True,
)
with progress:
def callback(data):
"Update the progress bar and write to output"
progress.update(len(data))
output_file.write(data)
ftp.retrbinary(command, callback, blocksize=self.chunk_size)
else:
ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
finally:
ftp.quit()
if ispath:
output_file.close()
return None
[docs]class SFTPDownloader: # pylint: disable=too-few-public-methods
"""
Download manager for fetching files over SFTP.
When called, downloads the given file URL into the specified local file.
Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
installed.
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
the download of files (for example, to use authentication or print a
progress bar).
Parameters
----------
port : int
Port used for the SFTP connection.
username : str
User name used to login to the server. Only needed if the server
requires authentication (i.e., no anonymous SFTP).
password : str
Password used to login to the server. Only needed if the server
requires authentication (i.e., no anonymous SFTP). Use the empty
string to indicate no password is required.
timeout : int
Timeout in seconds for sftp socket operations, use None to mean no
timeout.
progressbar : bool or an arbitrary progress bar object
If True, will print a progress bar of the download to standard
error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
be installed.
"""
def __init__(
self,
port=22,
username="anonymous",
password="",
account="",
timeout=None,
progressbar=False,
):
self.port = port
self.username = username
self.password = password
self.account = account
self.timeout = timeout
self.progressbar = progressbar
# Collect errors and raise only once so that both missing packages are
# captured. Otherwise, the user is only warned of one of them at a
# time (and we can't test properly when they are both missing).
errors = []
if self.progressbar and tqdm is None:
errors.append("Missing package 'tqdm' required for progress bars.")
if paramiko is None:
errors.append("Missing package 'paramiko' required for SFTP downloads.")
if errors:
raise ValueError(" ".join(errors))
[docs] def __call__(self, url, output_file, pooch):
"""
Download the given URL over SFTP to the given output file.
The output file must be given as a string (file name/path) and not an
open file object! Otherwise, paramiko cannot save to that file.
Parameters
----------
url : str
The URL to the file you want to download.
output_file : str
Path (and file name) to which the file will be downloaded. **Cannot
be a file object**.
pooch : :class:`~pooch.Pooch`
The instance of :class:`~pooch.Pooch` that is calling this method.
"""
parsed_url = parse_url(url)
connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
sftp = None
try:
connection.connect(username=self.username, password=self.password)
sftp = paramiko.SFTPClient.from_transport(connection)
sftp.get_channel().settimeout = self.timeout
if self.progressbar:
size = int(sftp.stat(parsed_url["path"]).st_size)
use_ascii = bool(sys.platform == "win32")
progress = tqdm(
total=size,
ncols=79,
ascii=use_ascii,
unit="B",
unit_scale=True,
leave=True,
)
if self.progressbar:
with progress:
def callback(current, total):
"Update the progress bar and write to output"
progress.total = int(total)
progress.update(int(current - progress.n))
sftp.get(parsed_url["path"], output_file, callback=callback)
else:
sftp.get(parsed_url["path"], output_file)
finally:
connection.close()
if sftp is not None:
sftp.close()
[docs]class DOIDownloader: # pylint: disable=too-few-public-methods
"""
Download manager for fetching files from Digital Object Identifiers (DOIs).
Open-access data repositories often issue Digital Object Identifiers (DOIs)
for data which provide a stable link and citation point. The trick is
finding out the download URL for a file given the DOI.
When called, this downloader uses the repository's public API to find out
the download URL from the DOI and file name. It then uses
:class:`pooch.HTTPDownloader` to download the URL into the specified local
file. Allowing "URL"s to be specified with the DOI instead of the actual
HTTP download link. Uses the :mod:`requests` library to manage downloads
and interact with the APIs.
The **format of the "URL"** is: ``doi:{DOI}/{file name}``.
Notice that there are no ``//`` like in HTTP/FTP and you must specify a
file name after the DOI (separated by a ``/``).
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
download files given the DOI instead of an HTTP link.
Supported repositories:
* `figshare <https://www.figshare.com>`__
* `Zenodo <https://www.zenodo.org>`__
* `DataVerse <https://dataverse.org/>`__ instances
.. attention::
DOIs from other repositories **will not work** since we need to access
their particular APIs to find the download links. We welcome
suggestions and contributions adding new repositories.
Parameters
----------
progressbar : bool or an arbitrary progress bar object
If True, will print a progress bar of the download to standard error
(stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
installed. Alternatively, an arbitrary progress bar object can be
passed. See :ref:`custom-progressbar` for details.
chunk_size : int
Files are streamed *chunk_size* bytes at a time instead of loading
everything into memory at one. Usually doesn't need to be changed.
**kwargs
All keyword arguments given when creating an instance of this class
will be passed to :func:`requests.get`.
Examples
--------
Download one of the data files from the figshare archive of Pooch test
data:
>>> import os
>>> downloader = DOIDownloader()
>>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
>>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
>>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
>>> os.path.exists("tiny-data.txt")
True
>>> with open("tiny-data.txt") as f:
... print(f.read().strip())
# A tiny data file for test purposes only
1 2 3 4 5 6
>>> os.remove("tiny-data.txt")
Same thing but for our Zenodo archive:
>>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
>>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
>>> os.path.exists("tiny-data.txt")
True
>>> with open("tiny-data.txt") as f:
... print(f.read().strip())
# A tiny data file for test purposes only
1 2 3 4 5 6
>>> os.remove("tiny-data.txt")
"""
def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
self.kwargs = kwargs
self.progressbar = progressbar
self.chunk_size = chunk_size
[docs] def __call__(self, url, output_file, pooch):
"""
Download the given DOI URL over HTTP to the given output file.
Uses the repository's API to determine the actual HTTP download URL
from the given DOI.
Uses :func:`requests.get`.
Parameters
----------
url : str
The URL to the file you want to download.
output_file : str or file-like object
Path (and file name) to which the file will be downloaded.
pooch : :class:`~pooch.Pooch`
The instance of :class:`~pooch.Pooch` that is calling this method.
"""
parsed_url = parse_url(url)
data_repository = doi_to_repository(parsed_url["netloc"])
# Resolve the URL
file_name = parsed_url["path"]
# remove the leading slash in the path
if file_name[0] == "/":
file_name = file_name[1:]
download_url = data_repository.download_url(file_name)
# Instantiate the downloader object
downloader = HTTPDownloader(
progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
)
downloader(download_url, output_file, pooch)
def doi_to_url(doi):
"""
Follow a DOI link to resolve the URL of the archive.
Parameters
----------
doi : str
The DOI of the archive.
Returns
-------
url : str
The URL of the archive in the data repository.
"""
# Use doi.org to resolve the DOI to the repository website.
response = requests.get(f"https://doi.org/{doi}")
url = response.url
if 400 <= response.status_code < 600:
raise ValueError(
f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
)
return url
def doi_to_repository(doi):
"""
Instantiate a data repository instance from a given DOI.
This function implements the chain of responsibility dispatch
to the correct data repository class.
Parameters
----------
doi : str
The DOI of the archive.
Returns
-------
data_repository : DataRepository
The data repository object
"""
# This should go away in a separate issue: DOI handling should
# not rely on the (non-)existence of trailing slashes. The issue
# is documented in https://github.com/fatiando/pooch/issues/324
if doi[-1] == "/":
doi = doi[:-1]
repositories = [
FigshareRepository,
ZenodoRepository,
DataverseRepository,
]
# Extract the DOI and the repository information
archive_url = doi_to_url(doi)
# Try the converters one by one until one of them returned a URL
data_repository = None
for repo in repositories:
if data_repository is None:
data_repository = repo.initialize(
archive_url=archive_url,
doi=doi,
)
if data_repository is None:
repository = parse_url(archive_url)["netloc"]
raise ValueError(
f"Invalid data repository '{repository}'. "
"To request or contribute support for this repository, "
"please open an issue at https://github.com/fatiando/pooch/issues"
)
return data_repository
class DataRepository: # pylint: disable=too-few-public-methods, missing-class-docstring
@classmethod
def initialize(cls, doi, archive_url): # pylint: disable=unused-argument
"""
Initialize the data repository if the given URL points to a
corresponding repository.
Initializes a data repository object. This is done as part of
a chain of responsibility. If the class cannot handle the given
repository URL, it returns `None`. Otherwise a `DataRepository`
instance is returned.
Parameters
----------
doi : str
The DOI that identifies the repository
archive_url : str
The resolved URL for the DOI
"""
return None # pragma: no cover
def download_url(self, file_name):
"""
Use the repository API to get the download URL for a file given
the archive URL.
Parameters
----------
file_name : str
The name of the file in the archive that will be downloaded.
Returns
-------
download_url : str
The HTTP URL that can be used to download the file.
"""
raise NotImplementedError # pragma: no cover
def populate_registry(self, pooch):
"""
Populate the registry using the data repository's API
Parameters
----------
pooch : Pooch
The pooch instance that the registry will be added to.
"""
raise NotImplementedError # pragma: no cover
class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring
def __init__(self, doi, archive_url):
self.archive_url = archive_url
self.doi = doi
self._api_response = None
@classmethod
def initialize(cls, doi, archive_url):
"""
Initialize the data repository if the given URL points to a
corresponding repository.
Initializes a data repository object. This is done as part of
a chain of responsibility. If the class cannot handle the given
repository URL, it returns `None`. Otherwise a `DataRepository`
instance is returned.
Parameters
----------
doi : str
The DOI that identifies the repository
archive_url : str
The resolved URL for the DOI
"""
# Check whether this is a Zenodo URL
parsed_archive_url = parse_url(archive_url)
if parsed_archive_url["netloc"] != "zenodo.org":
return None
return cls(doi, archive_url)
@property
def api_response(self):
"""Cached API response from Zenodo"""
if self._api_response is None:
article_id = self.archive_url.split("/")[-1]
self._api_response = requests.get(
f"https://zenodo.org/api/records/{article_id}"
).json()
return self._api_response
def download_url(self, file_name):
"""
Use the repository API to get the download URL for a file given
the archive URL.
Parameters
----------
file_name : str
The name of the file in the archive that will be downloaded.
Returns
-------
download_url : str
The HTTP URL that can be used to download the file.
"""
files = {item["key"]: item for item in self.api_response["files"]}
if file_name not in files:
raise ValueError(
f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
)
download_url = files[file_name]["links"]["self"]
return download_url
def populate_registry(self, pooch):
"""
Populate the registry using the data repository's API
Parameters
----------
pooch : Pooch
The pooch instance that the registry will be added to.
"""
for filedata in self.api_response["files"]:
pooch.registry[filedata["key"]] = filedata["checksum"]
class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring
def __init__(self, doi, archive_url):
self.archive_url = archive_url
self.doi = doi
self._api_response = None
@classmethod
def initialize(cls, doi, archive_url):
"""
Initialize the data repository if the given URL points to a
corresponding repository.
Initializes a data repository object. This is done as part of
a chain of responsibility. If the class cannot handle the given
repository URL, it returns `None`. Otherwise a `DataRepository`
instance is returned.
Parameters
----------
doi : str
The DOI that identifies the repository
archive_url : str
The resolved URL for the DOI
"""
# Check whether this is a Figshare URL
parsed_archive_url = parse_url(archive_url)
if parsed_archive_url["netloc"] != "figshare.com":
return None
return cls(doi, archive_url)
def _parse_version_from_doi(self):
"""
Parse version from the doi
Return None if version is not available in the doi.
"""
# Get suffix of the doi
_, suffix = self.doi.split("/")
# Split the suffix by dots and keep the last part
last_part = suffix.split(".")[-1]
# Parse the version from the last part
if last_part[0] != "v":
return None
version = int(last_part[1:])
return version
@property
def api_response(self):
"""Cached API response from Figshare"""
if self._api_response is None:
# Use the figshare API to find the article ID from the DOI
article = requests.get(
f"https://api.figshare.com/v2/articles?doi={self.doi}"
).json()[0]
article_id = article["id"]
# Parse desired version from the doi
version = self._parse_version_from_doi()
# With the ID and version, we can get a list of files and their
# download links
if version is None:
# Figshare returns the latest version available when no version
# is specified through the DOI.
warnings.warn(
f"The Figshare DOI '{self.doi}' doesn't specify which version of "
"the repository should be used. "
"Figshare will point to the latest version available.",
UserWarning,
)
# Define API url using only the article id
# (figshare will resolve the latest version)
api_url = f"https://api.figshare.com/v2/articles/{article_id}"
else:
# Define API url using article id and the desired version
# Get list of files using article id and the version
api_url = (
"https://api.figshare.com/v2/articles/"
f"{article_id}/versions/{version}"
)
# Make the request and return the files in the figshare repository
response = requests.get(api_url)
response.raise_for_status()
self._api_response = response.json()["files"]
return self._api_response
def download_url(self, file_name):
"""
Use the repository API to get the download URL for a file given
the archive URL.
Parameters
----------
file_name : str
The name of the file in the archive that will be downloaded.
Returns
-------
download_url : str
The HTTP URL that can be used to download the file.
"""
files = {item["name"]: item for item in self.api_response}
if file_name not in files:
raise ValueError(
f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
)
download_url = files[file_name]["download_url"]
return download_url
def populate_registry(self, pooch):
"""
Populate the registry using the data repository's API
Parameters
----------
pooch : Pooch
The pooch instance that the registry will be added to.
"""
for filedata in self.api_response:
pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
class DataverseRepository(DataRepository): # pylint: disable=missing-class-docstring
def __init__(self, doi, archive_url):
self.archive_url = archive_url
self.doi = doi
self._api_response = None
@classmethod
def initialize(cls, doi, archive_url):
"""
Initialize the data repository if the given URL points to a
corresponding repository.
Initializes a data repository object. This is done as part of
a chain of responsibility. If the class cannot handle the given
repository URL, it returns `None`. Otherwise a `DataRepository`
instance is returned.
Parameters
----------
doi : str
The DOI that identifies the repository
archive_url : str
The resolved URL for the DOI
"""
# Access the DOI as if this was a DataVerse instance
response = cls._get_api_response(doi, archive_url)
# If we failed, this is probably not a DataVerse instance
if 400 <= response.status_code < 600:
return None
# Initialize the repository and overwrite the api response
repository = cls(doi, archive_url)
repository.api_response = response
return repository
@classmethod
def _get_api_response(cls, doi, archive_url):
"""
Perform the actual API request
This has been separated into a separate ``classmethod``, as it can be
used prior and after the initialization.
"""
parsed = parse_url(archive_url)
response = requests.get(
f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/"
f":persistentId?persistentId=doi:{doi}"
)
return response
@property
def api_response(self):
"""Cached API response from a DataVerse instance"""
if self._api_response is None:
self._api_response = self._get_api_response(
self.doi, self.archive_url
) # pragma: no cover
return self._api_response
@api_response.setter
def api_response(self, response):
"""Update the cached API response"""
self._api_response = response
def download_url(self, file_name):
"""
Use the repository API to get the download URL for a file given
the archive URL.
Parameters
----------
file_name : str
The name of the file in the archive that will be downloaded.
Returns
-------
download_url : str
The HTTP URL that can be used to download the file.
"""
parsed = parse_url(self.archive_url)
# Iterate over the given files until we find one of the requested name
for filedata in self.api_response.json()["data"]["latestVersion"]["files"]:
if file_name == filedata["dataFile"]["filename"]:
return (
f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/"
f":persistentId?persistentId={filedata['dataFile']['persistentId']}"
)
raise ValueError(
f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
)
def populate_registry(self, pooch):
"""
Populate the registry using the data repository's API
Parameters
----------
pooch : Pooch
The pooch instance that the registry will be added to.
"""
for filedata in self.api_response.json()["data"]["latestVersion"]["files"]:
pooch.registry[
filedata["dataFile"]["filename"]
] = f"md5:{filedata['dataFile']['md5']}"