HLSL / Direct3D 11  Why does unrolling a loop affect the accuracy of the computations within?
Note This question is a duplicate of my original question at StackOverflow; https://stackoverflow.com/questions/71818661/doesunrollingaloopaffecttheaccuracyofthecomputationswithin. However, I believe it might be related to Direct3D 11, and I am therefore reaching out here as well.
Summarized question Does unrolling a loop affect the accuracy of the computations performed within the loop? And if so, why?
Elaboration and background I am writing a compute shader using HLSL for use in a Unityproject (2021.2.9f1). Parts of my code include numerical procedures and highly osciallatory functions, meaning that high computational accuracy is essential.
When comparing my results with an equivalent procedure in Python, I noticed that some deviations in the order of 1e5. This was concerning, as I did not expect such large errors to be the result of precision differences, e.g., the floatprecision in trigonometric or power functions in HLSL.
Ultimatley, after much debugging, I now believe the choice of unrolling or not unrolling a loop to be the cause of the deviation. However, I do find this strange, as I can not seem to find any sources indicating that unrolling a loop affects the accuracy in addition to the "space–time tradeoff".
For clarification, if considering my Python results as the correct solution, unrolling the loop in HLSL gives me better results than what not unrolling gives.
Minimal working example Below is an MWE consisting of a C# script for Unity, the corresponding compute shader where the computations are performed and a screenshot of my console when running in Unity (2021.2.9f1). Forgive me for a somewhat messy implementation of Newtons method, but I chose to keep it since I believe it might be a cause to this deviation. That is, if simply computing cos(x), then there is not difference between the unrolled and not unrolled. None the less, I still fail to understand how the simple addition of [unroll(N)] in the testing kernel changes the result...
// C# for Unity
using UnityEngine;
public class UnrollTest : MonoBehaviour
{
[SerializeField] ComputeShader CS;
ComputeBuffer CBUnrolled, CBNotUnrolled;
readonly int N = 3;
private void Start()
{
CBUnrolled = new ComputeBuffer(N, sizeof(double));
CBNotUnrolled = new ComputeBuffer(N, sizeof(double));
CS.SetBuffer(0, "_CBUnrolled", CBUnrolled);
CS.SetBuffer(0, "_CBNotUnrolled", CBNotUnrolled);
CS.Dispatch(0, (int)((N + (64  1)) / 64), 1, 1);
double[] ansUnrolled = new double[N];
double[] ansNotUnrolled = new double[N];
CBUnrolled.GetData(ansUnrolled);
CBNotUnrolled.GetData(ansNotUnrolled);
for (int i = 0; i < N; i++)
{
Debug.Log("Unrolled ans = " + ansUnrolled[i] +
"  Not Unrolled ans = " + ansNotUnrolled[i] +
"  Difference is: " + (ansUnrolled[i]  ansNotUnrolled[i]));
}
CBUnrolled.Release();
CBNotUnrolled.Release();
}
}

// Compute Shader
#pragma kernel CSMain
RWStructuredBuffer<double> _CBUnrolled, _CBNotUnrolled;
// Dummy function for Newtons method
double fDummy(double k, double fnh, double h, double theta)
{
return fnh * fnh * k * h * cos(theta) * cos(theta)  (double) tanh(k * h);
}
// Derivative of Dummy function above using a central finite difference scheme.
double dfDummy(double k, double fnh, double h, double theta)
{
return (fDummy(k + (double) 1e3, fnh, h, theta)  fDummy(k  (double) 1e3, fnh, h, theta)) / (double) 2e3;
}
// Function to solve.
double f(double fnh, double h, double theta)
{
// Solved using Newton's method.
int max_iter = 50;
double epsilon = 1e8;
double fxn, dfxn;
// Define initial guess for k, herby denoted as x.
double xn = 10.0;
for (int n = 0; n < max_iter; n++)
{
fxn = fDummy(xn, fnh, h, theta);
if (abs(fxn) < epsilon) // A solution is found.
return xn;
dfxn = dfDummy(xn, fnh, h, theta);
if (dfxn == 0.0) // No solution found.
return xn;
xn = xn  fxn / dfxn;
}
// No solution found.
return xn;
}
[numthreads(64,1,1)]
void CSMain(uint3 threadID : SV_DispatchThreadID)
{
int N = 3;
// 
double fnh = 0.9, h = 4.53052, theta = 0.161, dtheta = 0.01; // Example values.
for (int i = 0; i < N; i++) // Not being unrolled
{
_CBNotUnrolled[i] = f(fnh, h, theta);
theta += dtheta;
}
// 
fnh = 0.9, h = 4.53052, theta = 0.161, dtheta = 0.01; // Example values.
[unroll(N)] for (int j = 0; j < N; j++) // Being unrolled.
{
_CBUnrolled[j] = f(fnh, h, theta);
theta += dtheta;
}
}
Edit After some more testing, the deviation has been narrowed down to the following code, giving a difference of about 1e17 between the exact same code unrolled vs not unrolled. Despite the small difference, I still consider it a valid example of the issue, as I believe they should be equal.
[numthreads(64, 1, 1)]
void CSMain(uint3 threadID : SV_DispatchThreadID)
{
if ((int) threadID.x != 1)
return;
int N = 3;
double k = 1.0;
// 
double fnh = 0.9, h = 4.53052, theta = 0.161, dtheta = 0.01; // Example values.
for (int i = 0; i < N; i++) // Not being unrolled
{
_CBNotUnrolled[i] = (k + (double) 1e3) * theta  (k  (double) 1e3) * theta;
theta += dtheta;
}
// 
fnh = 0.9, h = 4.53052, theta = 0.161, dtheta = 0.01; // Example values.
[unroll(N)]
for (int j = 0; j < N; j++) // Being unrolled.
{
_CBUnrolled[j] = (k + (double) 1e3) * theta  (k  (double) 1e3) * theta;
theta += dtheta;
}
}
Edit 2 The following is the compiled code for the kernel given in Edit 1. Unfortunately, my experience with assembly language is limited, and I am not capable of spotting if this script shows any errors, or if it is useful to the problem at hand.
**** Platform Direct3D 11:
Compiled code for kernel CSMain
keywords: <none>
binary blob size 648:
//
// Generated by Microsoft (R) D3D Shader Disassembler
//
//
// Note: shader requires additional functionality:
// Doubleprecision floating point
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
//       
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
//       
// no Output
cs_5_0
dcl_globalFlags refactoringAllowed  enableDoublePrecisionFloatOps
dcl_uav_structured u0, 8
dcl_uav_structured u1, 8
dcl_input vThreadID.x
dcl_temps 2
dcl_thread_group 64, 1, 1
0: ine r0.x, vThreadID.x, l(1)
1: if_nz r0.x
2: ret
3: endif
4: dmov r0.xy, d(0.161000l, 0.000000l)
5: mov r0.z, l(0)
6: loop
7: ige r0.w, r0.z, l(3)
8: breakc_nz r0.w
9: dmul r1.xyzw, r0.xyxy, d(1.001000l, 0.999000l)
10: dadd r1.xy, r1.zwzw, r1.xyxy
11: store_structured u1.xy, r0.z, l(0), r1.xyxx
12: dadd r0.xy, r0.xyxy, d(0.010000l, 0.000000l)
13: iadd r0.z, r0.z, l(1)
14: endloop
15: store_structured u0.xy, l(0), l(0), l(0.000000,0.707432,0,0)
16: store_structured u0.xy, l(1), l(0), l(0.000000,0.702312,0,0)
17: store_structured u0.xy, l(2), l(0), l(918250586112.000000,0.697192,0,0)
18: ret
// Approximately 0 instruction slots used