-
Notifications
You must be signed in to change notification settings - Fork 367
Multi-part download for url_download in utils/download.py
#6247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
3aa3af6
52656fa
c97438c
a9d7c22
63a6aec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,15 +22,20 @@ | |
| import shutil | ||
| import socket | ||
| import sys | ||
| import time | ||
| import typing | ||
| import urllib.parse | ||
| from multiprocessing import Process | ||
| from concurrent.futures import ThreadPoolExecutor | ||
| from urllib.error import HTTPError | ||
| from urllib.request import urlopen | ||
| from pathlib import Path | ||
|
|
||
| from avocado.utils import crypto, output | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
| USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" | ||
|
|
||
|
|
||
| def url_open(url, data=None, timeout=5): | ||
| """ | ||
|
|
@@ -83,24 +88,103 @@ def _url_download(url, filename, data): | |
| src_file.close() | ||
|
|
||
|
|
||
| def url_download(url, filename, data=None, timeout=300): | ||
| def _download_range( | ||
| url: str, | ||
| start: int, | ||
| end: int, | ||
| part_file: str, | ||
| retries: int = 3, | ||
| delay: int = 1, | ||
| timeout: int = 300, | ||
| ): | ||
| """ | ||
| Retrieve a file from given url. | ||
|
|
||
| :param url: source URL. | ||
| :param filename: destination path. | ||
| :param data: (optional) data to post. | ||
| Downloads file within start and end byte range into part_files | ||
| :param url: URL to download. | ||
| :param start: Start byte range. | ||
| :param end: End byte range. | ||
| :param part_file: Part file name. | ||
| :param retries: Number of retries. | ||
| :param delay: Delay in seconds. | ||
| :param timeout: Timeout in seconds. | ||
| """ | ||
| for attempt in range(1, retries + 1): | ||
| try: | ||
| req = urllib.request.Request( | ||
| url, headers={"User-Agent": USER_AGENT, "Range": f"bytes={start}-{end}"} | ||
| ) | ||
| with urlopen(req, timeout=timeout) as r, open(part_file, "wb") as f: | ||
| shutil.copyfileobj(r, f, length=1024 * 1024) # 1MB chunks | ||
| return | ||
| except (socket.timeout, TimeoutError, HTTPError) as e: | ||
| if attempt == retries: | ||
| raise e | ||
| time.sleep(delay) | ||
|
|
||
|
|
||
| def url_download( | ||
| url: str, | ||
| filename: str, | ||
| data: typing.Optional[bytes] = None, | ||
| timeout: int = 300, | ||
| segments: int = 1, | ||
| ): | ||
| """ | ||
| Multipart downloader using thread pool executor by content-length splitting | ||
| :param url: URL to download. | ||
| :param filename: Target file name or full file path | ||
| :param data: (optional) data to POST Request | ||
| :param timeout: (optional) default timeout in seconds. | ||
| :return: `None`. | ||
| :param segments: How much should we split the download into | ||
| """ | ||
| process = Process(target=_url_download, args=(url, filename, data)) | ||
| log.info("Fetching %s -> %s", url, filename) | ||
| process.start() | ||
| process.join(timeout) | ||
| if process.is_alive(): | ||
| process.terminate() | ||
| process.join() | ||
| raise OSError("Aborting downloading. Timeout was reached.") | ||
| headers = urllib.request.urlopen( | ||
| urllib.request.Request(url, method="HEAD"), timeout=10 | ||
| ).headers # Using HEAD method to get the content length | ||
| size = int(headers.get("Content-Length", -1)) | ||
| supports_range = "bytes" in headers.get("Accept-Ranges", "").lower() | ||
|
|
||
| if segments == 1 or size <= 0 or data or not supports_range: | ||
| # Use single download when size/range is unavailable, request is POST, or segment size is 1 | ||
| _url_download(url=url, filename=filename, data=data) | ||
| return | ||
|
Comment on lines
139
to
148
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing error handling for HEAD request. The HEAD request can raise - headers = urllib.request.urlopen(
- urllib.request.Request(url, method="HEAD"), timeout=10
- ).headers # Using HEAD method to get the content length
- size = int(headers.get("Content-Length", -1))
- supports_range = "bytes" in headers.get("Accept-Ranges", "").lower()
-
- if size <= 0 or data or not supports_range:
+ try:
+ head_response = urllib.request.urlopen(
+ urllib.request.Request(url, method="HEAD"), timeout=timeout
+ )
+ headers = head_response.headers
+ size = int(headers.get("Content-Length", -1))
+ supports_range = "bytes" in headers.get("Accept-Ranges", "").lower()
+ except (socket.timeout, HTTPError, urllib.error.URLError):
+ size = -1
+ supports_range = False
+
+ if size <= 0 or data or not supports_range:Note: You'll need to add
🧰 Tools🪛 Ruff (0.14.6)137-139: Audit URL open for permitted schemes. Allowing use of (S310) 138-138: Audit URL open for permitted schemes. Allowing use of (S310) |
||
|
|
||
| part_size = size // segments | ||
| path = Path(filename) # takes absolute path | ||
| part_files = [str(path.parent / f"temp{i}_{path.name}") for i in range(segments)] | ||
| if part_size == 0: | ||
| # File too small for segmentation, fall back to single download | ||
| _url_download(url=url, filename=filename, data=data) | ||
| return | ||
|
|
||
| def task(i: int): | ||
| start = i * part_size | ||
| end = size - 1 if i == segments - 1 else (start + part_size - 1) | ||
| part_file = part_files[i] | ||
| _download_range( | ||
| url=url, start=start, end=end, part_file=part_file, timeout=timeout | ||
| ) | ||
|
|
||
| try: | ||
| with ThreadPoolExecutor(max_workers=segments) as executor: | ||
| # This will raise an exception if any task fails | ||
| list(executor.map(task, range(segments))) | ||
| # Merge the split files | ||
| with open(filename, "wb") as f: | ||
| for part in part_files: | ||
| with open(part, "rb") as pf: | ||
| shutil.copyfileobj(pf, f) | ||
| except Exception as e: | ||
| # If anything fails, remove the incomplete destination file and switch to single-part download | ||
| if os.path.exists(filename): | ||
| os.remove(filename) | ||
| log.warning( | ||
| "Multipart download failed (%s). Falling back to single-part download.", e | ||
| ) | ||
| _url_download(url=url, filename=filename, data=data) | ||
| finally: | ||
| # Always clean up part files | ||
| for part in part_files: | ||
| if os.path.exists(part): | ||
| os.remove(part) | ||
|
|
||
|
|
||
| def url_download_interactive(url, output_file, title="", chunk_size=102400): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason for this specific user agent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not really. Servers do not inherently require a User-Agent header to serve a download — the header is optional according to the HTTP specification. However, many servers are configured to block requests, or to return modified responses, when the header is missing or carries an unrecognized value, so a common browser user-agent string is used.