Source code for pooch.utils
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Misc utilities
"""
import logging
import os
import tempfile
import hashlib
from pathlib import Path
from urllib.parse import urlsplit
from contextlib import contextmanager
import warnings
import platformdirs
from packaging.version import Version
# Module-level logger named "pooch"; this is the object returned by
# get_logger().
# NOTE(review): the logger is instantiated directly instead of being obtained
# through logging.getLogger(). Presumably this is intentional, to keep it
# independent of the root logger's configuration -- confirm before changing.
LOGGER = logging.Logger("pooch")
# Attach a stream handler so records are emitted somewhere by default
# (logging.StreamHandler() without arguments writes to sys.stderr).
LOGGER.addHandler(logging.StreamHandler())
def file_hash(*args, **kwargs):
    """
    Deprecated alias that forwards to the ``file_hash`` of the ``hashes`` module.

    WARNING: Importing this function from pooch.utils is DEPRECATED.
    Please import from the top-level namespace (`from pooch import file_hash`)
    instead, which is fully backwards compatible with pooch >= 0.1.

    All positional and keyword arguments are passed through unchanged and the
    result of the forwarded call is returned. A :class:`DeprecationWarning`
    is issued on every call.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    # Imported lazily, inside the function, rather than at module import time.
    # pylint: disable=import-outside-toplevel
    from .hashes import file_hash as _forwarded_file_hash

    # The line breaks are part of the emitted warning text.
    deprecation_notice = (
        "\n"
        "Importing file_hash from pooch.utils is DEPRECATED. Please import from the\n"
        "top-level namespace (`from pooch import file_hash`) instead, which is fully\n"
        "backwards compatible with pooch >= 0.1.\n"
    )
    # stacklevel=2 attributes the warning to the caller of this wrapper.
    warnings.warn(deprecation_notice, DeprecationWarning, stacklevel=2)
    return _forwarded_file_hash(*args, **kwargs)
[docs]
def get_logger():
    r"""
    Return the event logger used by Pooch.

    Events such as downloading files or unzipping archives are recorded
    through this logger. Call :meth:`logging.Logger.setLevel` on the returned
    object to make Pooch more or less verbose.

    Returns
    -------
    logger : :class:`logging.Logger`
        The module-level logger object for Pooch (the same object on every
        call).

    """
    return LOGGER
[docs]
def os_cache(project):
    r"""
    Default cache location based on the operating system.

    The folder is whatever the ``user_cache_dir`` function of the
    ``platformdirs`` package reports for *project*. Usually, the locations
    will be the following (see the
    `platformdirs documentation <https://platformdirs.readthedocs.io>`__):

    * Mac: ``~/Library/Caches/<AppName>``
    * Unix: ``~/.cache/<AppName>`` or the value of the ``XDG_CACHE_HOME``
      environment variable, if defined.
    * Windows: ``C:\Users\<user>\AppData\Local\<AppAuthor>\<AppName>\Cache``

    Parameters
    ----------
    project : str
        The project name.

    Returns
    -------
    cache_path : :class:`pathlib.Path`
        The default location for the data cache. User directories (``'~'``)
        are not expanded.

    """
    cache_dir = platformdirs.user_cache_dir(project)
    return Path(cache_dir)
[docs]
def check_version(version, fallback="master"):
    """
    Check if a version is PEP440 compliant and there are no unreleased changes.

    For example, ``version = "0.1"`` will be returned as is but ``version =
    "0.1+10.8dl8dh9"`` will return the fallback. This is the convention used by
    `versioneer <https://github.com/warner/python-versioneer>`__ to mark that
    this version is 10 commits ahead of the last release.

    Parameters
    ----------
    version : str
        A version string.
    fallback : str
        What to return if the version string has unreleased changes.

    Returns
    -------
    version : str
        If *version* is PEP440 compliant and there are **no** unreleased
        changes (i.e., it has no local version segment after a ``+``), then
        return *version* unchanged. Otherwise, return *fallback*.

    Raises
    ------
    InvalidVersion
        If *version* is not PEP440 compliant.

    Examples
    --------

    >>> check_version("0.1")
    '0.1'
    >>> check_version("0.1a10")
    '0.1a10'
    >>> check_version("0.1+111.9hdg36")
    'master'
    >>> check_version("0.1+111.9hdg36", fallback="dev")
    'dev'

    """
    # Constructing the Version validates the string; the "local" segment is
    # whatever follows the "+", and is None when the string has no such part.
    local_segment = Version(version).local
    if local_segment is None:
        return version
    return fallback
def parse_url(url):
    """
    Split a URL into its 3 components:

    <protocol>://<netloc>/<path>

    Example URLs:

    * http://127.0.0.1:8080/test.nc
    * ftp://127.0.0.1:8080/test.nc
    * doi:10.6084/m9.figshare.923450.v1/test.nc

    DOIs are handled separately from regular URLs. The protocol is "doi", the
    netloc is the DOI itself, and the path is what comes after the last "/".

    Zenodo DOIs are the one exception: the netloc is made of only the
    "prefix/suffix" pair and the path is everything after the second "/".
    This makes it possible to support Zenodo DOIs whose path contains forward
    slashes "/", as created by the GitHub-Zenodo integration service.

    URLs without a scheme are given the protocol "file".

    Parameters
    ----------
    url : str
        The URL.

    Returns
    -------
    parsed_url : dict
        Three components of a URL (e.g.,
        ``{'protocol':'http', 'netloc':'127.0.0.1:8080','path': '/test.nc'}``).

    Raises
    ------
    ValueError
        If the URL starts with ``doi://``.

    """
    if url.startswith("doi://"):
        raise ValueError(
            f"Invalid DOI link '{url}'. You must not use '//' after 'doi:'."
        )

    # Regular URLs: delegate to the standard library.
    if not url.startswith("doi:"):
        pieces = urlsplit(url)
        return {
            "protocol": pieces.scheme or "file",
            "netloc": pieces.netloc,
            "path": pieces.path,
        }

    # DOI links: drop the leading "doi:" and work on the "/"-separated parts.
    segments = url[4:].split("/")
    if "zenodo" in segments[1].lower():
        # Zenodo: the DOI is always the first two segments.
        doi_length = 2
    else:
        # Everything but the last segment is the DOI.
        doi_length = len(segments) - 1
    return {
        "protocol": "doi",
        "netloc": "/".join(segments[:doi_length]),
        "path": "/" + "/".join(segments[doi_length:]),
    }
def cache_location(path, env=None, version=None):
    """
    Location of the cache given a base path and optional configuration.

    If the environment variable named by *env* is set to a non-empty value,
    that value replaces *path*. The *version*, when given, is appended as a
    final folder, and a leading ``~`` is expanded to the user's home.

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The path to the local data storage folder. If this is a list or tuple,
        we'll join the parts with the appropriate separator. Use
        :func:`pooch.os_cache` for a sensible default.
    env : str or None
        An environment variable that can be used to overwrite *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.
    version : str or None
        The version string for your project. Will be appended to given path if
        not None.

    Returns
    -------
    local_path : PathLike
        The path to the local directory.

    """
    # A set-but-empty environment variable does not override the given path.
    override = os.environ.get(env) if env is not None else None
    location = override if override else path
    if isinstance(location, (list, tuple)):
        location = os.path.join(*location)
    location = str(location)
    if version is not None:
        location = os.path.join(location, version)
    return Path(os.path.expanduser(location))
def make_local_storage(path, env=None):
    """
    Create the local cache directory and make sure it's writable.

    A missing folder is created (including parents). An existing folder is
    probed by creating a short-lived temporary file inside it.

    Parameters
    ----------
    path : str or PathLike
        The path to the local data storage folder.
    env : str or None
        An environment variable that can be used to overwrite *path*. Only used
        in the error message in case the folder is not writable.

    Raises
    ------
    PermissionError
        If the folder can't be created or written to. The message includes
        the original error and, when *env* is given, a hint to use it.

    """
    path = str(path)
    already_there = os.path.exists(path)
    # Only used to word the error message.
    action = "write to" if already_there else "create"
    try:
        if already_there:
            # Check that the data directory is writable
            with tempfile.NamedTemporaryFile(dir=path):
                pass
        else:
            # When running in parallel, it's possible that multiple jobs will
            # try to create the path at the same time. Use exist_ok to avoid
            # raising an error.
            os.makedirs(path, exist_ok=True)
    except PermissionError as error:
        hint = ""
        if env is not None:
            hint = (
                f" Use environment variable '{env}' to specify a different location."
            )
        raise PermissionError(
            str(error)
            + f" | Pooch could not {action} data cache folder '{path}'."
            + " Will not be able to download data files."
            + hint
        ) from error
@contextmanager
def temporary_file(path=None):
    """
    Create a closed and named temporary file and make sure it's cleaned up.

    Using :class:`tempfile.NamedTemporaryFile` will fail on Windows if trying
    to open the file a second time (when passing its name to Pooch function,
    for example). This context manager creates the file, closes it, yields the
    file path, and makes sure it's deleted in the end.

    Parameters
    ----------
    path : str or PathLike
        The directory in which the temporary file will be created.

    Yields
    ------
    fname : str
        The path to the temporary file.

    """
    # Leaving the "with" block closes the handle so that the file can be
    # opened elsewhere; delete=False keeps the file on disk after closing.
    with tempfile.NamedTemporaryFile(delete=False, dir=path) as handle:
        fname = handle.name
    try:
        yield fname
    finally:
        # The caller may have removed or moved the file already.
        if os.path.exists(fname):
            os.remove(fname)
def unique_file_name(url):
    """
    Create a unique file name based on the given URL.

    The file name will be unique to the URL by prepending the name with the MD5
    hash (hex digest) of the URL. The name will also include the last portion
    of the URL.

    The format will be: ``{md5}-{filename}.{ext}``

    The file name will be cropped so that the entire name (including the hash)
    is at most 255 characters long (the limit on most file systems).

    Parameters
    ----------
    url : str
        The URL with a file name at the end.

    Returns
    -------
    fname : str
        The file name, unique to this URL.

    Examples
    --------

    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt"))
    02ddee027ce5ebb3d7059fb23d210604-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2019/data.txt"))
    9780092867b497fca6fc87d8308f1025-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt.gz"))
    181a9d52e908219c2076f55145d6a344-data.txt.gz

    """
    digest = hashlib.md5(url.encode()).hexdigest()
    # Whatever follows the last "/" of the URL's path component.
    basename = parse_url(url)["path"].rpartition("/")[2]
    # Crop the start of the file name to fit 255 characters including the hash
    # and the "-" separator
    room_for_name = 255 - len(digest) - 1
    return f"{digest}-{basename[-room_for_name:]}"