2 changes: 2 additions & 0 deletions csrc/mlp_cuda.cu
@@ -10,6 +10,8 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>

typedef unsigned int uint;

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
25 changes: 18 additions & 7 deletions csrc/multi_tensor_axpby_kernel.cu
@@ -7,9 +7,10 @@

#include <assert.h>

#include <cmath>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4

@@ -61,9 +62,14 @@ struct AxpbyFunctor {
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
r_out[ii] = a * static_cast<float>(r_x[ii]) + b * static_cast<float>(r_y[ii]);
if (arg_to_check == -1) finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
if (arg_to_check == 0) finite = finite && isfinite(r_x[ii]);
if (arg_to_check == 1) finite = finite && isfinite(r_y[ii]);
if (arg_to_check == -1)
finite = finite && ((fabsf((float)r_x[ii]) <= 3.40282e+38f) && (fabsf((float)r_y[ii]) <= 3.40282e+38f));
// if (arg_to_check == -1) finite = finite && (std::isfinite(static_cast<float>(r_x[ii])) &&
// std::isfinite(static_cast<float>(r_y[ii])));
if (arg_to_check == 0) finite = finite && (fabsf((float)r_x[ii]) <= 3.40282e+38f);
// if (arg_to_check == 0) finite = finite && std::isfinite(static_cast<float>(r_x[ii]));
if (arg_to_check == 1) finite = finite && (fabsf((float)r_y[ii]) <= 3.40282e+38f);
// if (arg_to_check == 1) finite = finite && std::isfinite(static_cast<float>(r_y[ii]));
}
// store
load_store(out, r_out, i_start, 0);
@@ -84,9 +90,14 @@ struct AxpbyFunctor {
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
r_out[ii] = a * static_cast<float>(r_x[ii]) + b * static_cast<float>(r_y[ii]);
if (arg_to_check == -1) finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
if (arg_to_check == 0) finite = finite && isfinite(r_x[ii]);
if (arg_to_check == 1) finite = finite && isfinite(r_y[ii]);
if (arg_to_check == -1)
finite = finite && ((fabsf((float)r_x[ii]) <= 3.40282e+38f) && (fabsf((float)r_y[ii]) <= 3.40282e+38f));
// if (arg_to_check == -1) finite = finite && (std::isfinite(static_cast<float>(r_x[ii])) &&
// std::isfinite(static_cast<float>(r_y[ii])));
if (arg_to_check == 0) finite = finite && (fabsf((float)r_x[ii]) <= 3.40282e+38f);
// if (arg_to_check == 0) finite = finite && std::isfinite(static_cast<float>(r_x[ii]));
if (arg_to_check == 1) finite = finite && (fabsf((float)r_y[ii]) <= 3.40282e+38f);
// if (arg_to_check == 1) finite = finite && std::isfinite(static_cast<float>(r_y[ii]));
}
// see note in multi_tensor_scale_kernel.cu
#pragma unroll
7 changes: 5 additions & 2 deletions csrc/multi_tensor_scale_kernel.cu
@@ -7,6 +7,7 @@

#include <assert.h>
// Stringstream is a big hammer, but I want to rely on operator<< for dtype.
#include <cmath>
#include <sstream>

#include "multi_tensor_apply.cuh"
@@ -58,7 +59,8 @@ struct ScaleFunctor {
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
r_out[ii] = static_cast<float>(r_in[ii]) * scale;
finite = finite && isfinite(r_in[ii]);
finite = finite && (fabsf((float)r_in[ii]) <= 3.40282e+38f);
// finite = finite && std::isfinite(static_cast<float>(r_in[ii]));
}
// store
load_store(out, r_out, i_start, 0);
@@ -80,7 +82,8 @@ struct ScaleFunctor {
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
r_out[ii] = static_cast<float>(r_in[ii]) * scale;
finite = finite && isfinite(r_in[ii]);
finite = finite && (fabsf((float)r_in[ii]) <= 3.40282e+38f);
// finite = finite && std::isfinite(static_cast<float>(r_in[ii]));
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
17 changes: 11 additions & 6 deletions setup.py
@@ -238,12 +238,13 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
"csrc/update_scale_hysteresis.cu",
],
extra_compile_args={
"cxx": ["-O3"],
"cxx": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
"nvcc": [
"-lineinfo",
"-O3",
# '--resource-usage',
"--use_fast_math",
"-D_DISABLE_EXTENDED_ALIGNED_STORAGE",
],
},
)
@@ -274,19 +275,21 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
CUDAExtension(
name="mlp_cuda",
sources=["csrc/mlp.cpp", "csrc/mlp_cuda.cu"],
libraries=["cublas", "cublasLt"],
extra_compile_args={
"cxx": ["-O3"],
"nvcc": ["-O3"],
"cxx": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
"nvcc": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
},
)
)
ext_modules.append(
CUDAExtension(
name="fused_dense_cuda",
sources=["csrc/fused_dense.cpp", "csrc/fused_dense_cuda.cu"],
libraries=["cublas", "cublasLt"],
extra_compile_args={
"cxx": ["-O3"],
"nvcc": ["-O3"],
"cxx": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
"nvcc": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
},
)
)
@@ -405,15 +408,17 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
"csrc/megatron/fused_weight_gradient_dense_cuda.cu",
"csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu",
],
libraries=["cublas", "cublasLt"],
> **Collaborator** (on `libraries=["cublas", "cublasLt"]`): at glance this change looks not quite relevant, why would we need this?
extra_compile_args={
"cxx": ["-O3"],
"cxx": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
"nvcc": [
"-O3",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
"-D_DISABLE_EXTENDED_ALIGNED_STORAGE",
],
},
)
146 changes: 146 additions & 0 deletions windows_install.md
> **Collaborator:** I'm not quite sure if this file should be in the repo
@@ -0,0 +1,146 @@
# Build Fixes for NVIDIA Apex on Windows 11 (CUDA 12.8 / MSVC 2022)

## Installation Command

Run the commands below in the **x64 Native Tools Command Prompt for VS 2022** (use the Windows 11 search bar to find it). Before installing, make sure your environment already has the required dependencies, such as `PyTorch` and `ninja`.

```bash
git clone https://github.com/NVIDIA/apex.git
cd apex
set APEX_CPP_EXT=1
set APEX_CUDA_EXT=1
set DISTUTILS_USE_SDK=1
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation ./
```

## Troubleshooting
If you encounter errors in `compiled_autograd.h` (at line 1134, 1108, or 1181), then, per [PyTorch issue #148317](https://github.com/pytorch/pytorch/issues/148317#issuecomment-3344732754), you may need to open `\anaconda\envs\basic\lib\site-packages\torch\include\torch\csrc\dynamo\compiled_autograd.h`, go to line 1134, and change it from:
```cpp
} else if constexpr (::std::is_same_v<T, ::std::string>) {
return at::StringType::get();
```
to
```cpp
// } else if constexpr (::std::is_same_v<T, ::std::string>) {
// return at::StringType::get();
```

---

> **Note:** Building NVIDIA Apex on Windows is challenging, and different machines may hit different errors. This guide documents a successful build on Windows 11 with an RTX 5070 (sm_120) and CUDA 12.8.

---

## Build Environment

| Component | Version |
|-----------|---------|
| **OS** | Windows 11 |
| **CUDA Toolkit** | 12.8 (Blackwell / SM_100 / SM_120) |
| **CUDA Path** | `E:\CUDA128` |
| **Compiler** | MSVC 2022 (Visual Studio Build Tools) |
| **Python** | 3.10 |
| **PyTorch** | 2.9.1+cu128 |
| **Build Flags** | `APEX_CPP_EXT=1`, `APEX_CUDA_EXT=1` |

### NVCC Version Info

```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:38:46_Pacific_Standard_Time_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```

---

## Summary of Changes

This patch addresses **four primary categories** of build failures encountered on Windows:

1. Standard type definitions (`uint`)
2. MSVC-specific compiler flags for memory alignment
3. Explicit library linking for cuBLAS
4. Device-compatible replacement of `isfinite()` checks

---

## 1. `setup.py` Configuration

### Changes

Added `libraries=["cublas", "cublasLt"]` and `extra_compile_args` with `-D_DISABLE_EXTENDED_ALIGNED_STORAGE` to several CUDA extensions.

### Affected Extensions

- `mlp_cuda`
- `fused_dense_cuda`
- `fused_weight_gradient_mlp_cuda`
- *(And potentially others using cuBLAS or aligned storage)*

### Code Diff

```python
ext_modules.append(
CUDAExtension(
name="module_name",
sources=["..."],
# Fix 1: Explicitly link cuBLAS for Windows
libraries=["cublas", "cublasLt"],
extra_compile_args={
# Fix 2: Disable extended aligned storage to fix VS2019+ static assertion errors
"cxx": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE"],
"nvcc": ["-O3", "-D_DISABLE_EXTENDED_ALIGNED_STORAGE", ...],
},
)
)
```

### Reasoning

| Issue | Explanation |
|-------|-------------|
| **Linker Errors (`LNK2001`)** | Unlike Linux, the Windows build environment does not automatically link `cublas.lib` and `cublasLt.lib` when the cuBLAS headers are included. Explicit linking resolves unresolved external symbols such as `cublasGemmEx` and `cublasLtMatmul`. |
| **Alignment Errors** | Visual Studio 2017 15.8 and later changed how `std::aligned_storage` handles extended alignments, which triggers static assertion errors in some older CUDA/PyTorch headers. Defining `_DISABLE_EXTENDED_ALIGNED_STORAGE` restores the pre-15.8 behavior so compilation can succeed (see the sketch below the table). |
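
As a point of reference, here is a minimal, self-contained illustration of the MSVC behavior that the flag sidesteps; this snippet is an illustration of mine, not code from the patch:

```cpp
// Minimal sketch (illustration only, not code from this patch).
// On MSVC 2017 15.8+, instantiating std::aligned_storage with an extended
// alignment (Align > alignof(max_align_t)) triggers a static_assert unless
// _ENABLE_EXTENDED_ALIGNED_STORAGE or _DISABLE_EXTENDED_ALIGNED_STORAGE is
// defined. This patch defines the latter, accepting the pre-15.8 layout.
#include <type_traits>

// Rejected by stock MSVC 15.8+; accepted once either flag is defined.
using Storage64 = std::aligned_storage<64, 64>::type;

int main() {
  Storage64 s;
  (void)s;
  return 0;
}
```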

---

## 2. Source Code Fixes (`csrc/`)

### A. Type Definition Fix (`uint`)

**File:** `csrc/mlp_cuda.cu`

**Change:** Added `typedef unsigned int uint;` so that existing uses of `uint` compile.

**Reasoning:** The type alias `uint` is provided by Linux system headers but is **not defined** by default in the MSVC (Windows) environment. Defining it explicitly as the standard C++ type `unsigned int` keeps the code cross-platform without touching every use site.
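
A minimal sketch of the idea is below; the `_MSC_VER` guard is an illustrative variation of mine — the patch itself adds the plain `typedef` unconditionally, as shown in the diff above:

```cpp
// Sketch: provide the glibc-style `uint` alias where the platform headers
// do not define it. The actual patch adds the typedef unconditionally;
// guarding on _MSC_VER is just one way to scope it to MSVC builds.
#if defined(_MSC_VER)
typedef unsigned int uint;
#endif
```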

---

### B. Device Function Compatibility (`isfinite`)

**Files:**
- `csrc/multi_tensor_scale_kernel.cu`
- `csrc/multi_tensor_axpby_kernel.cu`

**Change:** Replaced the `isfinite()` check with a floating-point range check based on `fabsf`. The affected values are `r_in[ii]`, `r_x[ii]`, and `r_y[ii]`.

```cpp
// Before
finite = finite && (isfinite(r_in[ii])); ...

// After
finite = finite && (fabsf((float)r_in[ii]) <= 3.40282e+38f); ...
// Checks if value is within finite float range
```

**Reasoning:** On Windows, NVCC may resolve `isfinite` to the host-only C++ standard library overload (`std::isfinite`) rather than a device-callable function, producing a *"calling a host function from a device function"* error. Replacing it with `fabsf` (which maps to a device intrinsic) avoids the problem while preserving the intended behavior: infinities exceed the `3.40282e+38f` threshold, and NaN fails the `<=` comparison, so both are treated as non-finite.
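
The same check could also be factored into a small device helper. The sketch below assumes, like the patch, that values are validated after a cast to `float`; the helper name `is_finite_as_float` and the use of `FLT_MAX` are mine, not the patch's:

```cpp
#include <cfloat>  // FLT_MAX (~3.40282e+38f, the literal used in the patch)

// Sketch of a reusable device-side finiteness check. fabsf maps to a device
// intrinsic; +/-inf exceed FLT_MAX, and NaN fails any ordered comparison,
// so both cases make this return false.
template <typename T>
__device__ __forceinline__ bool is_finite_as_float(T v) {
  float f = static_cast<float>(v);
  return fabsf(f) <= FLT_MAX;
}
```

With such a helper, the kernel line would read `finite = finite && is_finite_as_float(r_in[ii]);`, equivalent to the inline check shown above.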

---




## License

These changes follow the original [NVIDIA Apex License](https://github.com/NVIDIA/apex/blob/master/LICENSE).