<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/113525>113525</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
【clang】Calculation result error at O2 and higher optimization
</td>
</tr>
<tr>
<th>Labels</th>
<td>
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
hippyll123
</td>
</tr>
</table>
<pre>
I found a bug in LLVM 11 that still exists in LLVM 18. The problem can be reproduced by compiling with the command `clang++ 1.cpp -O2`; with `-O1`, the result is as expected. From a brief look, the problem is that `result_buf2` is not reloaded from memory in loop 2. Instead, the value is taken directly from a register, and that register does not hold the intended value.
The test case is as follows:
```c++
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Sizing constants used by the tiling computation in the kernel below.
constexpr int core_num = 32;            // divisor used to split NHW into per-core shares
constexpr int ldm_total_size = 239616;  // 234 * 1024 bytes
constexpr int double_space = 2;         // every staging buffer is allocated twice over
#define MAX_DIM 8                       // capacity of each dim_* array in the args struct

// Double-buffer helpers. `x` must be a plain identifier: the macros paste
// `_size` and `_dbflag` onto it, so `x`, `x_size` and `x_dbflag` must all be
// in scope at the point of use.
//   CDBUF(x) - pointer to the half selected by x_dbflag
//   ADBUF(x) - pointer to the other half
//   EXDBF(x) - flip x_dbflag between 0 and 1
// Every operand and every expansion is parenthesized so that the macros stay
// correct when combined with neighbouring operators.
#define CDBUF(x) ((x) + (x##_size) * (x##_dbflag))
#define ADBUF(x) ((x) + (1 - (x##_dbflag)) * (x##_size))
#define EXDBF(x) (x##_dbflag = 1 - x##_dbflag)
// True when `v`, converted to unsigned long long, is a multiple of `n`.
// `n` is parenthesized so that an argument such as `a + b` is taken as a
// whole; without it `v % a + b == 0` would be evaluated instead.
#define ALIGN_N(v, n) (((unsigned long long)(v)) % (n) == 0)
// Plain-data argument bundle handed by value to the fused YOLO post-process
// kernel. Member order is kept exactly as declared so the layout is unchanged.
struct FusedYoloPostprocessArgs {
  // Launch/configuration fields (not read by the kernel shown with this struct).
  int slave_core_num;
  int slave_group_num;
  unsigned dataType;

  // Scalar coefficients and operand pointers; the names follow an
  // <op>_<stage>_<index> scheme.
  float mul_0_0;
  float sub_0_1;
  void *add_0_2;
  float mul_0_3;
  float mul_1_0;
  float exponent_1_1;
  void *mul_1_2;

  // Type-erased tensor pointers; the kernel casts them to its element types.
  void *x;
  void *y;

  // Per-tensor extents; only the first dim_size entries of dim_x are consulted
  // by the kernel shown with this struct.
  int dim_x[MAX_DIM];
  int dim_add_0_2[MAX_DIM];
  int dim_mul_1_2[MAX_DIM];
  int dim_y[MAX_DIM];
  int dim_size;

  // Integer switches for the optional stages.
  int enable_add_0_2_broadcast;
  int enable_mul_1_2_broadcast;
  int disable_sub_0_1;
  int disable_mul_0_3;
  int disable_mul_1_0;
};
// Returns the smaller of `a` and `b`, converted to the type of the first
// argument. When the two compare equal (or `a < b` is false), `b` is chosen.
template <typename T, typename U>
static __inline__ __attribute__((always_inline)) T MIN(const T &a, const U &b) {
  const bool first_is_smaller = a < b;
  return first_is_smaller ? a : b;
}
// Rounds `num` toward zero to a multiple of `unit` (default 32) by discarding
// the remainder of the truncating division. `unit` must be non-zero.
static constexpr __inline__ __attribute__((always_inline)) int floor(
    const int &num, const int &unit = 32) {
  return num - (num % unit);
}
// Reproducer kernel for the reported -O2 miscompile.
//
// Derives a tile height (bNHW) from the tensor extents in `arg`, fills one tile
// of a heap buffer with 1.0, then for every row of the tile squares
// (element * mul_1_0) at columns 2 and 3, writes 2.0 to column 0, and finally
// prints columns 0, 2 and 3 of each row.
//
// IN_T / OUT_T are the element types of the staging buffers; the store-back
// into x_buf (an IN_T buffer) goes through an OUT_T cast.
//
// NOTE(review): the exact statement order and expression shapes are what
// trigger the reported behaviour, so the code is intentionally left untouched.
// NOTE(review): none of the malloc results are checked for null.
template <typename IN_T, typename OUT_T>
void teco_custom_slave_fused_yolo_postprocess_impl(FusedYoloPostprocessArgs arg) {
// Both ids are fixed at 0 here, so every `threadIdx == 0` guard below is taken
// and the tile starts at row 0.
int threadIdx = 0;
int tid = 0;
// NHW = product of every extent except the last; C = the last extent.
int NHW = 1;
for (int i = 0; i < arg.dim_size - 1; i++) {
NHW *= arg.dim_x[i];
}
int C = arg.dim_x[arg.dim_size - 1];
// Local copies of the argument fields. Only mul_1_0 is read again below; the
// remaining locals are not used in this function.
float mul_0_0 = arg.mul_0_0;
float sub_0_1 = arg.sub_0_1;
OUT_T *add_0_2 = (OUT_T *)arg.add_0_2;
float mul_0_3 = arg.mul_0_3;
float mul_1_0 = arg.mul_1_0;
OUT_T *mul_1_2 = (OUT_T *)arg.mul_1_2;
IN_T *x = (IN_T *)arg.x;
OUT_T *y = (OUT_T *)arg.y;
int disable_sub_0_1 = arg.disable_sub_0_1;
int disable_mul_0_3 = arg.disable_mul_0_3;
int disable_mul_1_0 = arg.disable_mul_1_0;
// Bytes subtracted from the ldm_total_size budget before sizing the tile.
int ldm_used_size = sizeof(float) * 4; // mul_0_0, exponent_1_1, result_buf
// Ceiling division: rows per core when NHW is split across core_num cores.
int NHW_per_core = NHW / core_num + (NHW % core_num ? 1 : 0);
// Largest row count that fits the remaining budget: each row is charged for C
// input and C output elements plus four OUT_T values, all times double_space.
int max_bNHW =
(ldm_total_size - ldm_used_size) /
((C * (sizeof(IN_T) + sizeof(OUT_T)) + 2 * (sizeof(OUT_T) + sizeof(OUT_T))) * double_space);
int bNHW = MIN(max_bNHW, NHW_per_core);
// align 2x
// (values of 2 or more are rounded down to an even number; 0 and 1 are kept)
bNHW = (bNHW < 2) ? bNHW : floor(bNHW, 2);
// Element counts of one half of each double buffer. y_buf_size is computed
// but never used in this function.
int x_buf_size = bNHW * C;
int y_buf_size = bNHW * C;
int add_buf_size = bNHW * 2;
int mul_buf_size = bNHW * 2;
// Heap allocations; each double buffer gets double_space halves. result_buf,
// add_buf and mul_buf are only allocated and freed, never accessed.
float *result_buf = (float *)malloc(4 * sizeof(float));
IN_T *x_buf = (IN_T *)malloc(x_buf_size * sizeof(IN_T) * double_space);
OUT_T *add_buf = (OUT_T *)malloc(add_buf_size * sizeof(OUT_T) * double_space);
OUT_T *mul_buf = (OUT_T *)malloc(mul_buf_size * sizeof(OUT_T) * double_space);
// All flags start at 0, so CDBUF(buf) resolves to the first half of buf.
int x_buf_dbflag = 0, y_buf_dbflag = 0, add_buf_dbflag = 0, mul_buf_dbflag = 0;
// First row of this tile and the number of rows actually present in it.
int index_NHW = tid * bNHW;
int curr_NHW = MIN(bNHW, NHW - index_NHW); // 4
if (curr_NHW > 0) {
// Initialise the first curr_NHW * C elements of x_buf to 1.0.
for (int tt = 0; tt < curr_NHW * C; tt++) {
*(x_buf + tt) = 1.0f;
}
if (threadIdx == 0) {
printf("bNHW = %d, ", bNHW);
printf("C = %d, bNEF = %d, curr_NHW = %d\n", C, bNHW, curr_NHW);
}
// Per row: load columns 2 and 3, scale by mul_1_0, square, set column 0 to
// 2.0 and store the squares back to columns 2 and 3. Each iteration touches
// a different row (stride C), so no iteration reads what another one wrote.
for (int index = 0; index < curr_NHW; index++) { // curr_NHW = 4
float result_buf2 = (float)(*(CDBUF(x_buf) + index * C + 2)) * mul_1_0; // x_buf + x_buf_size * x_buf_dbflag
float result_buf3 = (float)(*(CDBUF(x_buf) + index * C + 3)) * mul_1_0;
result_buf2 *= result_buf2;
result_buf3 *= result_buf3;
(*(CDBUF(x_buf) + index * C)) = 2.0f;
(*(CDBUF(x_buf) + index * C + 2)) = (OUT_T)result_buf2;
(*(CDBUF(x_buf) + index * C + 3)) = (OUT_T)result_buf3;
}
// NOTE(review): presumably a marker to locate the loop boundary in the
// generated assembly; confirm with the reporter.
asm volatile("nop\n\tnop\n\tnop");
// Dump columns 0, 2 and 3 of every processed row.
if (threadIdx == 0) {
for (int index = 0; index < curr_NHW; index++) {
printf("loop %d:\n", index);
printf("x_buf[%d] = %f\n", C * index + 0, (float)(*(CDBUF(x_buf) + C * index + 0)));
printf("x_buf[%d] = %f\n", C * index + 2, (float)(*(CDBUF(x_buf) + C * index + 2)));
printf("x_buf[%d] = %f\n", C * index + 3, (float)(*(CDBUF(x_buf) + C * index + 3)));
}
}
asm volatile("nop\n\tnop");
}
// Flip the double-buffer flags; their new values are not read afterwards.
EXDBF(x_buf);
EXDBF(add_buf);
EXDBF(mul_buf);
// Release every allocation made above.
free(x_buf);
free(add_buf);
free(mul_buf);
free(result_buf);
}
// Non-template entry point: runs the kernel with float as both the input and
// the output element type, forwarding `args` by value.
void teco_custom_slave_fused_yolo_postprocess(FusedYoloPostprocessArgs args) {
  using Element = float;
  teco_custom_slave_fused_yolo_postprocess_impl<Element, Element>(args);
}
// Driver for the reproducer: builds a 160 x 80 problem with mul_1_0 = 2 and
// runs the float/float kernel once. Returns 0.
int main() {
  printf("main fun\n");
  // Value-initialize so that every field this driver does not set (pointers,
  // unused extents, flags) is zero instead of indeterminate when the struct is
  // copied into the kernel.
  FusedYoloPostprocessArgs args{};
  args.dim_size = 2;
  args.dim_x[0] = 160;
  args.dim_x[1] = 80;  // the original line lacked its terminating ';'
  args.mul_0_0 = 1.0f;
  args.sub_0_1 = 0.0f;
  args.mul_0_3 = 1.0f;
  args.mul_1_0 = 2.0f;
  args.disable_sub_0_1 = 1;
  args.disable_mul_0_3 = 1;
  args.disable_mul_1_0 = 0;
  teco_custom_slave_fused_yolo_postprocess(args);
  return 0;
}
```
The expected result is:
```
main fun
bNHW = 4, C = 80, bNEF = 4, curr_NHW = 4
loop 0:
x_buf[0] = 2.000000
x_buf[2] = 4.000000
x_buf[3] = 4.000000
loop 1:
x_buf[80] = 2.000000
x_buf[82] = 4.000000
x_buf[83] = 4.000000
loop 2:
x_buf[160] = 2.000000
x_buf[162] = 4.000000
x_buf[163] = 4.000000
loop 3:
x_buf[240] = 2.000000
x_buf[242] = 4.000000
x_buf[243] = 4.000000
```
When compiling with `-O2` or a higher optimization level, the result is:
```
main fun
bNHW = 4, C = 80, bNEF = 4, curr_NHW = 4
loop 0:
x_buf[0] = 2.000000
x_buf[2] = 4.000000
x_buf[3] = 4.000000
loop 1:
x_buf[80] = 2.000000
x_buf[82] = 64.000000
x_buf[83] = 4.000000
loop 2:
x_buf[160] = 2.000000
x_buf[162] = 64.000000
x_buf[163] = 4.000000
loop 3:
x_buf[240] = 2.000000
x_buf[242] = 64.000000
x_buf[243] = 4.000000
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWk9zozoS_zTKpSsuELYHH3ywnXg3VTuTPWRq354oAbKtHYEoEBn7ffotCYQFCDt5s29PL5WyQd3qP-runxphUlXsmFO6RostWjw9kFqeRLk-saK4cO7j4CEW6WX9AgdR5ykQiOsjsBw4f898H0ieApNQScY50DOrZNVRwxm8nSgUpYg5zSAhOcQUSlpwlhBJU4gvkIisYJzlR_jJ5EndZiRPUbABtPQSTvIjwluEt-DPkqKAx1eMlh7CO_h5ojnUFVV8j69-OypPSkFVcwmsAnouaCJpOoO9KJXpJaMH4EL8MLzGNlaBPBGphDXTo7g-KFWQClpBLiSUlAuSgsj1RC5EAXgGLxIk-UErPZgSSSBlJU0kv8ChFFlr0JFVkpZ6sXoDrBHNcknzlKYz5D0hb9N8qqWTtJKQkIoqTlLBQXAuflYo2Nicynf9nzRr1Y7igOUJr1MKKNhVMmW5nJ1Q8DxFFpPUjMiTTdSficgrSc9FqcyHRJQ0yusMUPAEAUbB1sXF0yySQhIeVex3qnlxsFr6SxRsARDeI7wHHMx_xK7ZqahjTqOqIEk7t1PTWZ3SA8spfN38Fj29fAUIh5Td0_b7HuHwjPAK1Deo7GoGAoQDbVlD24AZS-MDJ0eEV0NpmylpPjyOZ2uZY1UOuc-_PW07uX052nOH-KFl_3j527foG8Lhu0r2vDWw-a9zXfQpcJEf9Yc2Q7GuGsYF5EqR0uVd7dOf8lLQlB6gkmWdSNjXFU3_Lbj4p6hkUYqEVtWmPFaAvrTBAQAdv4qTdxqZTOli1ycfS1EXI3pnsCqxt0tBe9QDF0RCVvPIizwHparjyIv8HuVdsFTFg6Rp5EV4Ul4wQfGdmui5EDnNZeRPqGumYift7By9jBYqZVl0Rottm-Ro8eRkMZ7dZTQ23WW83GfRKT0k0Jyowm0NiuJSkDQhlZxibA26wZiySnO6ImvTXTEc0u1Ioi9Pkwk9ABtJs4ITqQFSlUROMgpvemMxd987yKwkkSyBKGI5ZzmNIogiImXJ4lrSKGqqkvCf5FK1LG0lvsHXF1XEGgnhDRBeEqWjuf-u7mNdsXaxlVTWZQ5EmQYxoGCvrzcQ247azrT2XfH285aqZT1wIUrF0pnS2KloCC9VVXe2t2N1zqTZNCb8QDjUOwveg-I2QKqvJ_xxBuflW9SPz-v3t-iti5EuOEkTESV1JUUWNYB0UOkQXQQXUXFNiIhlBUc4nEQ_Uh5H_iiX5amkJH1Jz9Bg6zAzpap6J-Xb3__VYP8gETX6iFItk2Jj3XR9uVOWzExpwqOeDqztFIYWqj-tB2-UFDNVoQ0blvxgyY2ZOxjOHBkwENSD7272XTjvOF0goENr4btmRjjsxhFeqbl34X9gz43toMc53B46vS22Tdnj2h9U3ur9wUwyA-2cs1vTZUrHxZFADlC14vgpsB3N-yAIO-cN19F0kboouyZSXYgDwqEOiAGIud1VmnTCu_4mjXdw7fhHBRcVtNQdi1bTVMbeanabZq8ZX9hN8B58jbi6eRo6kJFzFLfl3K89hMNBj_zYd7fxbT-ahXC4M-1ltxoN4K20md1gA3qmz9sCHk0zHJPzzALbDbnLT-Nju4cZt9Wa22s7nNpGjHB2zAGfr4ROHsJhe72DZtsI9oa66XYhowu7bDureF8zKG5RD3YjzsuHORWYuHnxOAdqfpd3CDQIb665ahaiIyG8ygjnIkE4nGtJo7IYLkSHLLZAC106efZq2YKvGXY7G3pwbOmyoalT1l9GW52VmR_U1y7zTX39UHxanyuvrEc1DTkX56hxdDhuDOqNDxOI5Sk9R6YiZPOwoLNoxJrUZRn1a9GqQ3i8ymr86kpwPvLwoFbRkvesEW7cRFgNiZTXjkRf7yyD2joCKW90JA0obEwealySDcyrfmjmHXouTzQn
V_t7PZh5xHWrLUqWy4MGWGzBzyJVi4cwVl9xt3C35-96k-Nvz_veQC9KenCxy1sVu6ueK6NL54Tfdn-oYm31iO2tJdaM9iJiAXPP0PnY5waRrHO0HlQ1Jw1NOLvDGL35tjtOa5HKjGaDsnaca0tgjLlmxACj7EK8b2PwazYGThtdkQD9XGMtTdNp26eOrjzqWTqcEtzQ9WE_jAfBE2BXRX1KWD9wFvYivLrn6h9c-SklwUdrhFQZvAtOJOO0KdhcFLoGFzs5uMSjnuIPocuvl-VY5hB19Em1hpNgYyFKK8qNWkMZTQQW2waVngxCHWyE0jEx0dk2G9mH68kxe-XoV_6H9uFfsg__6fYFv2RfcNu-rgImB-4Ww7gCHGXVHWO3tvb4DbHtgqbIbTPk7rgOJaVT8lvahPiW6pBuUa0nQ1t939FPnRvdOTKqRmX9uQOpYNfmy67Z4VDwrJagETzlQPNIynId54F6O4sVDxzqvMva_prd9mscOzV8PRfqv8_p0c9osfVM2fhLb5rLN1yhN-CwT5dGDaPmsE89PCeHfb7hlmGfZIw20dbS8RmLP83WU3mbzeh1NR6fyM5hrsD1HNZzJJB59dg7fD3R9tUryeX1dezUW8vmtksufRdf28sGG5uY2m3zfNQzt62o3vC8TpvB3i6D8MzTf30yNuS5kxy4yVqXP9IV3lEW3tEW3lKHR-pUTdzU5y_vKPSXtzQGI414fm8553c04vmExhuJ1b3zf3zFIEo4seOJliAKyTL2O5GsfTn_V8rdSrnl_z3nJjT-mUk3oXI661z59pCug3QVrMgDXftf8OrLfOUvVg-ntRdSf0XDNKVh8AV7yYosE-p7AfYXYUCXyQNbYw_PfQ_PvcUCB-EsTMLgsMBk5R3SFHsHNPdoRhifcf6ezUR5fGBVVdO17wcLvHjgJKa8Mr_JKdeK6zGujxWae5xVsrrOk0xyukbPAQo9tGp_M9Pe-TvCk5o3ldFWBS1LUQKR8Ir1D1IcRfRQl3x9krLQBaSfs49Mnup4logM4b3S3X49FqX4D00kwnvtQIXwvvXhfY3_GwAA__9nlee_">