# Source code for atomistic_cookbook_utils._download

"""HTTP download with retries on transient errors."""

from __future__ import annotations

from pathlib import Path
from typing import Tuple, Union

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


# Filesystem-path argument type: a plain string or a pathlib.Path.
PathLike = Union[str, Path]


def download_with_retry(
    url: str,
    destination: PathLike,
    *,
    retries: int = 5,
    backoff_factor: float = 1.0,
    status_forcelist: Tuple[int, ...] = (429, 500, 502, 503, 504),
    overwrite: bool = False,
    chunk_size: int = 1 << 16,
    timeout: float | None = 30.0,
) -> Path:
    """Download ``url`` to ``destination`` with automatic retries.

    The destination's parent directories are created as needed.  If the
    file already exists and ``overwrite`` is false (the default), the
    download is skipped and the existing path is returned.

    Retries follow urllib3's :class:`~urllib3.util.retry.Retry` with an
    exponential backoff.  With the defaults (``retries=5``,
    ``backoff_factor=1``) the waits between attempts are roughly
    1, 2, 4, 8, 16 seconds.

    Parameters
    ----------
    url
        URL to fetch.
    destination
        Where to save the file.
    retries
        Maximum number of retry attempts on the status codes listed in
        ``status_forcelist`` (and on connect/read errors).
    backoff_factor
        Exponential backoff factor passed to urllib3 ``Retry``.
    status_forcelist
        HTTP status codes that trigger a retry.
    overwrite
        If true, re-download even if the destination already exists.
    chunk_size
        Streaming chunk size in bytes.
    timeout
        Connect/read timeout in seconds passed to ``requests``.  For a
        streamed response this bounds the connect phase and each gap
        between reads, not the total download time.  ``None`` disables
        the timeout (previous behavior).

    Returns
    -------
    pathlib.Path
        The resolved destination path.

    Raises
    ------
    requests.HTTPError
        If the final response has a non-retryable error status.
    requests.exceptions.RetryError
        If the retry budget is exhausted on a retryable status.
    """
    destination = Path(destination)
    # Cheap early exit: keep an existing file unless told otherwise.
    if destination.exists() and not overwrite:
        return destination
    destination.parent.mkdir(parents=True, exist_ok=True)

    retry = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=list(status_forcelist),
        allowed_methods=frozenset(["GET", "HEAD"]),
    )
    adapter = HTTPAdapter(max_retries=retry)

    with requests.Session() as session:
        # Mount the retrying adapter for both schemes so redirects
        # across http/https keep the retry policy.
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        # stream=True avoids loading the whole payload into memory;
        # timeout prevents an unresponsive server from hanging forever.
        with session.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            # Write to a sibling ".part" file and atomically rename on
            # success, so a failed download never leaves a truncated
            # file at the destination path.
            tmp_path = destination.with_suffix(destination.suffix + ".part")
            try:
                with open(tmp_path, "wb") as fh:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:  # skip keep-alive chunks
                            fh.write(chunk)
                tmp_path.replace(destination)
            except BaseException:
                # Clean up the partial file on any failure (including
                # KeyboardInterrupt), then re-raise.
                tmp_path.unlink(missing_ok=True)
                raise
    return destination