Source code for pooch.core

# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
The main Pooch class and a factory function for it.
"""
import os
import time
import contextlib
from pathlib import Path
import shlex
import shutil


from .hashes import hash_matches, file_hash
from .utils import (
    check_version,
    get_logger,
    make_local_storage,
    cache_location,
    temporary_file,
    os_cache,
    unique_file_name,
)
from .downloaders import DOIDownloader, choose_downloader, doi_to_repository


def retrieve(
    url,
    known_hash,
    fname=None,
    path=None,
    processor=None,
    downloader=None,
    progressbar=False,
):
    """
    Download and cache a single file locally.

    Uses HTTP or FTP by default, depending on the protocol in *url*; pass a
    *downloader* to control the download method. The file is streamed to a
    temporary location first and its hash is compared to *known_hash* before
    it is moved into the cache, so a bad download never clobbers good data.
    An existing local file whose hash no longer matches *known_hash* is
    treated as outdated and downloaded again. With ``known_hash=None`` all
    checks are skipped and the SHA256 of the downloaded file is logged so it
    can be copied into future calls (highly recommended for reproducibility).
    If the file already exists with a matching hash, no download happens and
    the absolute path is returned directly.

    .. note::

        This function is meant for downloading single files. To manage the
        download and caching of several files, with versioning, use
        :func:`pooch.create` and :class:`pooch.Pooch` instead.

    Parameters
    ----------
    url : str
        The URL of the file to download. Ideally ends in a file name.
    known_hash : str or None
        Expected hash (checksum) of the file, assumed SHA256 unless prefixed
        with an algorithm name (e.g. ``"md5:..."`` or ``"sha1:..."``). If
        None, skip hash verification and update checks entirely.
    fname : str or None
        File name used to store the download (name only, appended to
        *path*). If None, a unique name is built from the last part of the
        URL plus the MD5 hash of the URL so different URLs never collide.
    path : str or PathLike or None
        The cache folder on disk. If None, a ``pooch`` folder in the default
        OS cache location is used (see :func:`pooch.os_cache`).
    processor : None or callable
        If given, called after the (possible) download and before returning;
        its return value replaces the path. See :ref:`processors`.
    downloader : None or callable
        If given, called to download the URL to a local file name. See
        :ref:`downloaders`.
    progressbar : bool or an arbitrary progress bar object
        If True, print a progress bar to stderr (requires ``tqdm``).
        Alternatively, pass a custom progress bar object. See
        :ref:`custom-progressbar`.

    Returns
    -------
    full_path : str
        The absolute path (including the file name) of the file in the
        local storage.
    """
    if path is None:
        path = os_cache("pooch")
    if fname is None:
        fname = unique_file_name(url)
    # Normalize the cache folder and build the absolute destination path.
    destination = cache_location(path, env=None, version=None).resolve() / fname

    action, verb = download_action(destination, known_hash)

    if action in ("download", "update"):
        # About to write data, so make sure the cache folder exists.
        make_local_storage(cache_location(path, env=None, version=None))
        get_logger().info(
            "%s data from '%s' to file '%s'.",
            verb,
            url,
            str(destination),
        )
        if downloader is None:
            downloader = choose_downloader(url, progressbar=progressbar)
        stream_download(url, destination, known_hash, downloader, pooch=None)
        if known_hash is None:
            # Help the user pin this download for reproducible runs.
            get_logger().info(
                "SHA256 hash of downloaded file: %s\n"
                "Use this value as the 'known_hash' argument of 'pooch.retrieve'"
                " to ensure that the file hasn't changed if it is downloaded again"
                " in the future.",
                file_hash(str(destination)),
            )

    if processor is None:
        return str(destination)
    return processor(str(destination), action, None)
def create(
    path,
    base_url,
    version=None,
    version_dev="master",
    env=None,
    registry=None,
    urls=None,
    retry_if_failed=0,
    allow_updates=True,
):
    """
    Create a :class:`~pooch.Pooch` with sensible defaults to fetch data files.

    When *version* is given, the Pooch is versioned: the version string is
    appended to the local storage path (e.g. ``~/.mypooch/cache/v0.1``) and
    substituted into the ``{version}`` mark of *base_url* (e.g.
    ``https://github.com/fatiando/pooch/raw/v0.1/data``). Version strings
    containing ``+XX.XXXXX`` are treated as development versions and mapped
    to *version_dev*. Versioning avoids spurious re-downloads when users
    switch between environments with different versions of your library.

    Does **not** create the local data storage folder. The folder is only
    created on the first download attempt with :meth:`pooch.Pooch.fetch`,
    which makes this function safe to call at module level (on ``import``).

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The local data storage folder. List/tuple parts are joined with the
        appropriate separator. *version* is appended to the end. Use
        :func:`pooch.os_cache` for a sensible default.
    base_url : str
        Base URL for the remote data source; all requests are made relative
        to it. Should contain a ``{version}`` formatting mark (we call
        ``.format(version=version)`` on it). A trailing ``'/'`` is added
        automatically if missing.
    version : str or None
        PEP440-compatible project version. If None, *base_url* is not
        formatted and no subfolder is appended to *path*.
    version_dev : str
        Name used for the development version (default ``"master"``, a good
        choice for GitHub raw links). Ignored if *version* is None.
    env : str or None
        Environment variable that can override *path*; *version* is appended
        to its value as well.
    registry : dict or None
        Mapping of managed file names to their hashes. Only registered files
        can be fetched. Entries in subdirectories of *path* **must use
        Unix-style separators** (``'/'``) even on Windows.
    urls : dict or None
        Custom per-file download URLs (keys are file names). Entries here
        take precedence over *base_url*; not every registry file needs one.
    retry_if_failed : int
        Number of extra download attempts after a bad connection or hash
        mismatch (default 0: one attempt only). Waits 1s before the first
        retry, growing by 1s per retry up to a maximum of 10s.
    allow_updates : bool or str
        Whether hash-mismatched local files may be updated from the remote.
        A string is treated as the name of an environment variable holding
        the true/false value. If False, any mismatch with registry hashes
        raises an error. Defaults to True.

    Returns
    -------
    pooch : :class:`~pooch.Pooch`
        The :class:`~pooch.Pooch` initialized with the given arguments.
    """
    if version is not None:
        version = check_version(version, fallback=version_dev)
        base_url = base_url.format(version=version)
    # Don't create the cache folder here! This function usually runs in the
    # module context (at import time) and concurrent imports could race on
    # creating the folder, so touching the file system is avoided.
    path = cache_location(path, env, version)
    if isinstance(allow_updates, str):
        # Resolve from the named environment variable; anything other than
        # an explicit "false" counts as permission to update.
        allow_updates = os.environ.get(allow_updates, "true").lower() != "false"
    # Guarantee exactly one trailing "/" on the base URL.
    return Pooch(
        path=path,
        base_url=base_url.rstrip("/") + "/",
        registry=registry,
        urls=urls,
        retry_if_failed=retry_if_failed,
        allow_updates=allow_updates,
    )
class Pooch:
    """
    Manager for a local data storage that can fetch from a remote source.

    Avoid creating ``Pooch`` instances directly. Use :func:`pooch.create`
    instead.

    Parameters
    ----------
    path : str
        The local data storage folder. The path must exist in the file
        system.
    base_url : str
        Base URL for the remote data source; all requests are made relative
        to it.
    registry : dict or None
        Mapping of managed file names to their hashes. Only registered files
        can be fetched. Entries in subdirectories of *path* **must use
        Unix-style separators** (``'/'``) even on Windows.
    urls : dict or None
        Custom per-file download URLs (keys are file names). Entries here
        take precedence over *base_url*; not every registry file needs one.
    retry_if_failed : int
        Number of extra download attempts after a bad connection or hash
        mismatch (default 0: one attempt only). Waits 1s before the first
        retry, growing by 1s per retry up to a maximum of 10s.
    allow_updates : bool
        Whether hash-mismatched local files may be updated from the remote.
        If False, any mismatch with registry hashes raises an error.
        Defaults to True.
    """

    def __init__(
        self,
        path,
        base_url,
        registry=None,
        urls=None,
        retry_if_failed=0,
        allow_updates=True,
    ):
        self.path = path
        self.base_url = base_url
        # Keep a reference to the caller's registry dict, but copy the URLs.
        self.registry = {} if registry is None else registry
        self.urls = {} if urls is None else dict(urls)
        self.retry_if_failed = retry_if_failed
        self.allow_updates = allow_updates

    @property
    def abspath(self):
        "Absolute path to the local storage"
        expanded = os.path.expanduser(str(self.path))
        return Path(os.path.abspath(expanded))

    @property
    def registry_files(self):
        "List of file names on the registry"
        return list(self.registry)

    def fetch(self, fname, processor=None, downloader=None, progressbar=False):
        """
        Get the absolute path to a file in the local storage.

        Downloads the file if it is not in local storage. If the hash of the
        local file doesn't match the registry, a new copy is downloaded (the
        file is assumed to have been updated remotely). If the downloaded
        file *still* doesn't match the registered hash, an exception is
        raised to warn of possible corruption.

        Use *processor* for post-download actions (unzipping, format
        conversion, etc.) that should run only once, right after download.
        Use *downloader* for custom protocols, authentication, or custom
        progress reporting.

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage) to fetch from the local storage.
        processor : None or callable
            If given, called after the (possible) download and before
            returning; its return value replaces the path. See
            :ref:`processors`.
        downloader : None or callable
            If given, called to download the URL to a local file name. See
            :ref:`downloaders`.
        progressbar : bool or an arbitrary progress bar object
            If True, print a progress bar to stderr (requires ``tqdm``).
            Alternatively, pass a custom progress bar object. See
            :ref:`custom-progressbar`.

        Returns
        -------
        full_path : str
            The absolute path (including the file name) of the file in the
            local storage.
        """
        self._assert_file_in_registry(fname)
        url = self.get_url(fname)
        full_path = self.abspath / fname
        known_hash = self.registry[fname]

        action, verb = download_action(full_path, known_hash)
        if action == "update" and not self.allow_updates:
            raise ValueError(
                f"{fname} needs to update {full_path} but updates are disallowed."
            )
        if action in ("download", "update"):
            # About to write data, so make sure the local folder exists.
            make_local_storage(str(self.abspath))
            get_logger().info(
                "%s file '%s' from '%s' to '%s'.",
                verb,
                fname,
                url,
                str(self.abspath),
            )
            if downloader is None:
                downloader = choose_downloader(url, progressbar=progressbar)
            stream_download(
                url,
                full_path,
                known_hash,
                downloader,
                pooch=self,
                retry_if_failed=self.retry_if_failed,
            )

        if processor is None:
            return str(full_path)
        return processor(str(full_path), action, self)

    def _assert_file_in_registry(self, fname):
        """Raise :class:`ValueError` if *fname* is not in the registry."""
        if fname not in self.registry:
            raise ValueError(f"File '{fname}' is not in the registry.")

    def get_url(self, fname):
        """
        Get the full URL to download a file in the registry.

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage) to fetch from the local storage.
        """
        self._assert_file_in_registry(fname)
        # Custom per-file URLs win over the base URL.
        return self.urls.get(fname, self.base_url + fname)

    def load_registry(self, fname):
        """
        Load entries from a file and add them to the registry.

        Use this if you are managing many files. Each line holds a file name
        and its hash separated by a space; hashes may use the ``"alg:hash"``
        format (SHA256 is assumed when no algorithm is given). A custom
        download URL for the file may follow as a third element. Lines
        starting with ``#`` are comments and are skipped.

        Parameters
        ----------
        fname : str | fileobj
            Path (or open file object) to the registry file.
        """
        with contextlib.ExitStack() as stack:
            if hasattr(fname, "read"):
                # Already an open file object; don't manage its lifetime.
                fin = fname
            else:
                fin = stack.enter_context(open(fname, encoding="utf-8"))
            for linenum, line in enumerate(fin):
                if isinstance(line, bytes):
                    line = line.decode("utf-8")
                line = line.strip()
                # Skip line comments.
                if line.startswith("#"):
                    continue
                elements = shlex.split(line)
                if len(elements) not in (0, 2, 3):
                    raise OSError(
                        f"Invalid entry in Pooch registry file '{fname}': "
                        f"expected 2 or 3 elements in line {linenum + 1} but got "
                        f"{len(elements)}. Offending entry: '{line}'"
                    )
                if not elements:
                    continue
                file_name, file_checksum = elements[0], elements[1]
                if len(elements) == 3:
                    self.urls[file_name] = elements[2]
                self.registry[file_name] = file_checksum.lower()

    def load_registry_from_doi(self):
        """
        Populate the registry using the data repository API

        Queries the data repository API behind ``base_url`` for every
        available file and its hash. No file is downloaded during this
        process.

        .. important::

            This method is intended to be used only when the ``base_url`` is
            a DOI.
        """
        # Ensure that this is indeed a DOI-based pooch.
        downloader = choose_downloader(self.base_url)
        if not isinstance(downloader, DOIDownloader):
            raise ValueError(
                f"Invalid base_url '{self.base_url}': "
                + "Pooch.load_registry_from_doi is only implemented for DOIs"
            )
        # Strip the scheme and hand the bare DOI to the repository resolver.
        repository = doi_to_repository(self.base_url.replace("doi:", ""))
        # Delegate registry population to the repository implementation.
        return repository.populate_registry(self)

    def is_available(self, fname, downloader=None):
        """
        Check availability of a remote file without downloading it.

        Use this method when working with large files to check if they are
        available for download.

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage).
        downloader : None or callable
            If given, called to check the availability of the file on the
            server. See :ref:`downloaders`.

        Returns
        -------
        status : bool
            True if the file is available for download. False otherwise.
        """
        self._assert_file_in_registry(fname)
        url = self.get_url(fname)
        if downloader is None:
            downloader = choose_downloader(url)
        try:
            return downloader(url, None, self, check_only=True)
        except TypeError as error:
            # Downloaders that predate check_only don't accept the keyword.
            raise NotImplementedError(
                f"Downloader '{str(downloader)}' does not support availability checks."
            ) from error
def download_action(path, known_hash):
    """
    Determine the action that is needed to get the file on disk.

    Parameters
    ----------
    path : PathLike
        The path to the file on disk.
    known_hash : str
        A known hash (checksum) of the file. Will be used to verify the
        download or check if an existing file needs to be updated. By
        default, will assume it's a SHA256 hash. To specify a different
        hashing method, prepend the hash with ``algorithm:``, for example
        ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``.

    Returns
    -------
    action, verb : str
        The action that must be taken and the English verb (infinitive form
        of *action*) used in the log:

        * ``'download'``: File does not exist locally and must be downloaded.
        * ``'update'``: File exists locally but needs to be updated.
        * ``'fetch'``: File exists locally and only need to inform its path.

    """
    if not path.exists():
        action = "download"
        verb = "Downloading"
    elif not hash_matches(str(path), known_hash):
        # Present on disk but stale relative to the registry.
        action = "update"
        verb = "Updating"
    else:
        action = "fetch"
        verb = "Fetching"
    return action, verb


def stream_download(url, fname, known_hash, downloader, pooch=None, retry_if_failed=0):
    """
    Stream the file and check that its hash matches the known one.

    The file is first downloaded to a temporary file name in the cache
    folder. It will be moved to the desired file name only if the hash
    matches the known hash. Otherwise, the temporary file is deleted.

    If the download fails for either a bad connection or a hash mismatch, we
    will retry the download the specified number of times in case the failure
    was due to a network error.

    Parameters
    ----------
    url : str
        The URL to download from.
    fname : PathLike
        Destination path (including file name) for the downloaded file.
    known_hash : str or None
        Expected hash of the file; verified strictly after each download.
    downloader : callable
        Function called as ``downloader(url, local_file, pooch)`` to perform
        the actual transfer.
    pooch : :class:`~pooch.Pooch` or None
        Passed through to *downloader*.
    retry_if_failed : int
        Number of extra attempts after a failed download (default 0).

    Raises
    ------
    ValueError
        If the hash still doesn't match after the last attempt.
    requests.exceptions.RequestException
        If the connection keeps failing after the last attempt.
    """
    # Lazy import requests to speed up import time
    import requests.exceptions  # pylint: disable=C0415

    # Ensure the parent directory exists in case the file is in a
    # subdirectory. Otherwise, move will cause an error. Use exist_ok=True
    # instead of checking exists() first: the check-then-create pattern
    # raised FileExistsError when another process created the folder between
    # the check and the makedirs call.
    os.makedirs(str(fname.parent), exist_ok=True)
    download_attempts = 1 + retry_if_failed
    max_wait = 10
    for i in range(download_attempts):
        try:
            # Stream the file to a temporary so that we can safely check its
            # hash before overwriting the original.
            with temporary_file(path=str(fname.parent)) as tmp:
                downloader(url, tmp, pooch)
                hash_matches(tmp, known_hash, strict=True, source=str(fname.name))
                shutil.move(tmp, str(fname))
            break
        except (ValueError, requests.exceptions.RequestException):
            # Give up once the last attempt has failed.
            if i == download_attempts - 1:
                raise
            retries_left = download_attempts - (i + 1)
            get_logger().info(
                "Failed to download '%s'. "
                "Will attempt the download again %d more time%s.",
                str(fname.name),
                retries_left,
                "s" if retries_left > 1 else "",
            )
            # Back off linearly: 1s, 2s, ... capped at max_wait seconds.
            time.sleep(min(i + 1, max_wait))