Source code for pooch.downloaders

"""
Download hooks for Pooch.fetch
"""
import sys
import ftplib

import requests

from .utils import parse_url


try:
    from tqdm import tqdm
except ImportError:
    tqdm = None


def choose_downloader(url):
    """
    Choose the appropriate downloader for the given URL based on the protocol.

    Parameters
    ----------
    url : str
        A URL (including protocol).

    Returns
    -------
    downloader
        A downloader class (either :class:`pooch.HTTPDownloader` or
        :class:`pooch.FTPDownloader`).

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader

    """
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
    }
    parsed_url = parse_url(url)
    if parsed_url["protocol"] not in known_downloaders:
        raise ValueError(
            "Unrecognized URL protocol '{}' in '{}'. Must be one of {}.".format(
                parsed_url["protocol"], url, known_downloaders.keys()
            )
        )
    downloader = known_downloaders[parsed_url["protocol"]]()
    return downloader


[docs]class HTTPDownloader: # pylint: disable=too-few-public-methods """ Download manager for fetching files over HTTP/HTTPS. When called, downloads the given file URL into the specified local file. Uses the :mod:`requests` library to manage downloads. Use with :meth:`pooch.Pooch.fetch` to customize the download of files (for example, to use authentication or print a progress bar). Parameters ---------- progressbar : bool If True, will print a progress bar of the download to standard error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be installed. chunk_size : int Files are streamed *chunk_size* bytes at a time instead of loading everything into memory at one. Usually doesn't need to be changed. **kwargs All keyword arguments given when creating an instance of this class will be passed to :func:`requests.get`. Examples -------- Download one of the data files from the Pooch repository: >>> import os >>> from pooch import version, check_version >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt" >>> url = url.format(check_version(version.full_version)) >>> downloader = HTTPDownloader() >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) >>> os.path.exists("tiny-data.txt") True >>> with open("tiny-data.txt") as f: ... print(f.read().strip()) # A tiny data file for test purposes only 1 2 3 4 5 6 >>> os.remove("tiny-data.txt") Authentication can be handled by passing a user name and password to :func:`requests.get`. All arguments provided when creating an instance of the class are forwarded to :func:`requests.get`. We'll use ``auth=(username, password)`` to use basic HTTPS authentication. The https://httpbin.org website allows us to make a fake a login request using whatever username and password we provide to it: >>> user = "doggo" >>> password = "goodboy" >>> # httpbin will ask for the user and password we provide in the URL >>> url = "https://httpbin.org/basic-auth/{}/{}".format(user, password) >>> # Trying without the login credentials causes an error >>> downloader = HTTPDownloader() >>> try: ... downloader(url=url, output_file="tiny-data.txt", pooch=None) ... except Exception: ... print("There was an error!") There was an error! >>> # Pass in the credentials to HTTPDownloader >>> downloader = HTTPDownloader(auth=(user, password)) >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) >>> with open("tiny-data.txt") as f: ... for line in f: ... print(line.rstrip()) { "authenticated": true, "user": "doggo" } >>> os.remove("tiny-data.txt") """ def __init__(self, progressbar=False, chunk_size=1024, **kwargs): self.kwargs = kwargs self.progressbar = progressbar self.chunk_size = chunk_size if self.progressbar and tqdm is None: raise ValueError("Missing package 'tqdm' required for progress bars.")
[docs] def __call__(self, url, output_file, pooch): """ Download the given URL over HTTP to the given output file. Uses :func:`requests.get`. Parameters ---------- url : str The URL to the file you want to download. output_file : str or file-like object Path (and file name) to which the file will be downloaded. pooch : :class:`~pooch.Pooch` The instance of :class:`~pooch.Pooch` that is calling this method. """ kwargs = self.kwargs.copy() kwargs.setdefault("stream", True) ispath = not hasattr(output_file, "write") if ispath: output_file = open(output_file, "w+b") try: response = requests.get(url, **kwargs) response.raise_for_status() content = response.iter_content(chunk_size=self.chunk_size) if self.progressbar: total = int(response.headers.get("content-length", 0)) # Need to use ascii characters on Windows because there isn't # always full unicode support # (see https://github.com/tqdm/tqdm/issues/454) use_ascii = bool(sys.platform == "win32") progress = tqdm( total=total, ncols=79, ascii=use_ascii, unit="B", unit_scale=True, leave=True, ) for chunk in content: if chunk: output_file.write(chunk) output_file.flush() if self.progressbar: # Use the chunk size here because chunk may be much # larger if the data are decompressed by requests after # reading (happens with text files). progress.update(self.chunk_size) # Make sure the progress bar gets filled even if the actual number # is chunks is smaller than expected. This happens when streaming # text files that are compressed by the server when sending (gzip). # Binary files don't experience this. if self.progressbar: progress.reset() progress.update(total) progress.close() finally: if ispath: output_file.close()
[docs]class FTPDownloader: # pylint: disable=too-few-public-methods """ Download manager for fetching files over FTP. When called, downloads the given file URL into the specified local file. Uses the :mod:`ftplib` module to manage downloads. Use with :meth:`pooch.Pooch.fetch` to customize the download of files (for example, to use authentication or print a progress bar). Parameters ---------- port : int Port used for the FTP connection. username : str User name used to login to the server. Only needed if the server requires authentication (i.e., no anonymous FTP). password : str Password used to login to the server. Only needed if the server requires authentication (i.e., no anonymous FTP). Use the empty string to indicate no password is required. account : str Some servers also require an "account" name for authentication. timeout : int Timeout in seconds for ftp socket operations, use None to mean no timeout. progressbar : bool If True, will print a progress bar of the download to standard error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be installed. chunk_size : int Files are streamed *chunk_size* bytes at a time instead of loading everything into memory at one. Usually doesn't need to be changed. """ def __init__( self, port=21, username="anonymous", password="", account="", timeout=None, progressbar=False, chunk_size=1024, ): self.port = port self.username = username self.password = password self.account = account self.timeout = timeout self.progressbar = progressbar self.chunk_size = chunk_size if self.progressbar and tqdm is None: raise ValueError("Missing package 'tqdm' required for progress bars.")
[docs] def __call__(self, url, output_file, pooch): """ Download the given URL over FTP to the given output file. Parameters ---------- url : str The URL to the file you want to download. output_file : str or file-like object Path (and file name) to which the file will be downloaded. pooch : :class:`~pooch.Pooch` The instance of :class:`~pooch.Pooch` that is calling this method. """ parsed_url = parse_url(url) ftp = ftplib.FTP(timeout=self.timeout) ftp.connect(host=parsed_url["netloc"], port=self.port) ispath = not hasattr(output_file, "write") if ispath: output_file = open(output_file, "w+b") try: ftp.login(user=self.username, passwd=self.password, acct=self.account) command = "RETR {}".format(parsed_url["path"]) if self.progressbar: size = int(ftp.size(parsed_url["path"])) use_ascii = bool(sys.platform == "win32") progress = tqdm( total=size, ncols=79, ascii=use_ascii, unit="B", unit_scale=True, leave=True, ) with progress: def callback(data): "Update the progress bar and write to output" progress.update(len(data)) output_file.write(data) ftp.retrbinary(command, callback, blocksize=self.chunk_size) else: ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size) finally: ftp.quit() if ispath: output_file.close()