[llvm] [AMDGPU] precision error observed after SIPeepholeSDWA optimization for fp16 (PR #109395)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 20 02:27:52 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pankaj Dwivedi (PankajDwivedi-25)
<details>
<summary>Changes</summary>
Without SDWA optimization:
%124:vgpr_32 = V_LSHRREV_B32_e64 16, %121:vgpr_32, implicit $exec
%126:vgpr_32 = contract nofpexcept V_ADD_F16_e64 0, %121:vgpr_32, 0, %124:vgpr_32, 0, 0, implicit $mode, implicit $exec
%129:vgpr_32 = contract nofpexcept V_SUB_F16_e64 0, %121:vgpr_32, 0, %124:vgpr_32, 0, 0, implicit $mode, implicit $exec
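With SDWA optimization (src1 of both instructions now reads %121 directly, with the last operand, src1_sel, set to 5 = WORD_1, instead of reading the shifted value %124):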
%124:vgpr_32 = V_LSHRREV_B32_e64 16, %121:vgpr_32, implicit $exec
%126:vgpr_32 = contract nofpexcept V_ADD_F16_sdwa 0, %121:vgpr_32, 0, %121:vgpr_32, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
%129:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %121:vgpr_32, 0, %121:vgpr_32, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
---
Full diff: https://github.com/llvm/llvm-project/pull/109395.diff
1 Files Affected:
- (added) llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll (+265)
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll b/llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll
new file mode 100644
index 00000000000000..ffbb7f89de050b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll
@@ -0,0 +1,265 @@
+%struct.rocfft_complex = type { half, half }
+
+$_Z32real_post_process_kernel_inplaceI14rocfft_complexIDF16_ELb1EEvmmmPT_mPKS2_ = comdat any
+
+; Function Attrs: convergent inlinehint mustprogress nounwind
+define weak_odr hidden void @_Z32real_post_process_kernel_inplaceI14rocfft_complexIDF16_ELb1EEvmmmPT_mPKS2_(i64 noundef %0, i64 noundef %1, i64 noundef %2, ptr noundef %3, i64 noundef %4, ptr noundef %5) #2 comdat {
+ %7 = alloca i64, align 8, addrspace(5)
+ %8 = alloca i64, align 8, addrspace(5)
+ %9 = alloca i64, align 8, addrspace(5)
+ %10 = alloca ptr, align 8, addrspace(5)
+ %11 = alloca i64, align 8, addrspace(5)
+ %12 = alloca ptr, align 8, addrspace(5)
+ %13 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %14 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %15 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %16 = alloca double, align 8, addrspace(5)
+ %17 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %18 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %19 = alloca double, align 8, addrspace(5)
+ %20 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %21 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+ %22 = addrspacecast ptr addrspace(5) %7 to ptr
+ %23 = addrspacecast ptr addrspace(5) %8 to ptr
+ %24 = addrspacecast ptr addrspace(5) %9 to ptr
+ %25 = addrspacecast ptr addrspace(5) %10 to ptr
+ %26 = addrspacecast ptr addrspace(5) %11 to ptr
+ %27 = addrspacecast ptr addrspace(5) %12 to ptr
+ %28 = addrspacecast ptr addrspace(5) %13 to ptr
+ %29 = addrspacecast ptr addrspace(5) %14 to ptr
+ %30 = addrspacecast ptr addrspace(5) %15 to ptr
+ %31 = addrspacecast ptr addrspace(5) %16 to ptr
+ %32 = addrspacecast ptr addrspace(5) %17 to ptr
+ %33 = addrspacecast ptr addrspace(5) %18 to ptr
+ %34 = addrspacecast ptr addrspace(5) %19 to ptr
+ %35 = addrspacecast ptr addrspace(5) %20 to ptr
+ %36 = addrspacecast ptr addrspace(5) %21 to ptr
+ store i64 %0, ptr %22, align 8, !tbaa !6
+ store i64 %1, ptr %23, align 8, !tbaa !6
+ store i64 %2, ptr %24, align 8, !tbaa !6
+ store ptr %3, ptr %25, align 8, !tbaa !10
+ store i64 %4, ptr %26, align 8, !tbaa !6
+ store ptr %5, ptr %27, align 8, !tbaa !10
+ %37 = load i64, ptr %22, align 8, !tbaa !6
+ %38 = load i64, ptr %24, align 8, !tbaa !6
+ br label %40
+
+40: ; preds = %6
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %13) #4
+ %41 = load ptr, ptr %25, align 8, !tbaa !10
+ %42 = load i64, ptr %26, align 8, !tbaa !6
+ %43 = load i64, ptr %22, align 8, !tbaa !6
+ %44 = add i64 %42, %43
+ %45 = getelementptr inbounds %struct.rocfft_complex, ptr %41, i64 %44
+ call void @llvm.memcpy.p0.p0.i64(ptr align 2 %28, ptr align 2 %45, i64 4, i1 false), !tbaa.struct !12
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %14) #4
+ %46 = load ptr, ptr %25, align 8, !tbaa !10
+ %47 = load i64, ptr %26, align 8, !tbaa !6
+ %48 = load i64, ptr %23, align 8, !tbaa !6
+ %49 = add i64 %47, %48
+ %50 = getelementptr inbounds %struct.rocfft_complex, ptr %46, i64 %49
+ call void @llvm.memcpy.p0.p0.i64(ptr align 2 %29, ptr align 2 %50, i64 4, i1 false), !tbaa.struct !12
+ %51 = load i64, ptr %22, align 8, !tbaa !6
+ %52 = icmp eq i64 %51, 0
+ br i1 %52, label %53, label %102
+
+53: ; preds = %40
+ %54 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 0
+ %55 = load half, ptr %54, align 2, !tbaa !15
+ %56 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 1
+ %57 = load half, ptr %56, align 2, !tbaa !17
+ %58 = fadd contract half %55, %57
+ %59 = load ptr, ptr %25, align 8, !tbaa !10
+ %60 = load i64, ptr %26, align 8, !tbaa !6
+ %61 = load i64, ptr %22, align 8, !tbaa !6
+ %62 = add i64 %60, %61
+ %63 = getelementptr inbounds %struct.rocfft_complex, ptr %59, i64 %62
+ %64 = getelementptr inbounds %struct.rocfft_complex, ptr %63, i32 0, i32 0
+ store half %58, ptr %64, align 2, !tbaa !15
+ %65 = load ptr, ptr %25, align 8, !tbaa !10
+ %66 = load i64, ptr %26, align 8, !tbaa !6
+ %67 = load i64, ptr %22, align 8, !tbaa !6
+ %68 = add i64 %66, %67
+ %69 = getelementptr inbounds %struct.rocfft_complex, ptr %65, i64 %68
+ %70 = getelementptr inbounds %struct.rocfft_complex, ptr %69, i32 0, i32 1
+ store half 0xH0000, ptr %70, align 2, !tbaa !17
+ %71 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 0
+ %72 = load half, ptr %71, align 2, !tbaa !15
+ %73 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 1
+ %74 = load half, ptr %73, align 2, !tbaa !17
+ %75 = fsub contract half %72, %74
+ %76 = load ptr, ptr %25, align 8, !tbaa !10
+ %77 = load i64, ptr %26, align 8, !tbaa !6
+ %78 = load i64, ptr %23, align 8, !tbaa !6
+ %79 = add i64 %77, %78
+ %80 = getelementptr inbounds %struct.rocfft_complex, ptr %76, i64 %79
+ %81 = getelementptr inbounds %struct.rocfft_complex, ptr %80, i32 0, i32 0
+ store half %75, ptr %81, align 2, !tbaa !15
+ %82 = load ptr, ptr %25, align 8, !tbaa !10
+ %83 = load i64, ptr %26, align 8, !tbaa !6
+ %84 = load i64, ptr %23, align 8, !tbaa !6
+ %85 = add i64 %83, %84
+ %86 = getelementptr inbounds %struct.rocfft_complex, ptr %82, i64 %85
+ %87 = getelementptr inbounds %struct.rocfft_complex, ptr %86, i32 0, i32 1
+ store half 0xH0000, ptr %87, align 2, !tbaa !17
+ %88 = load ptr, ptr %25, align 8, !tbaa !10
+ %89 = load i64, ptr %26, align 8, !tbaa !6
+ %90 = load i64, ptr %24, align 8, !tbaa !6
+ %91 = add i64 %89, %90
+ %92 = getelementptr inbounds %struct.rocfft_complex, ptr %88, i64 %91
+ %93 = getelementptr inbounds %struct.rocfft_complex, ptr %92, i32 0, i32 1
+ %94 = load half, ptr %93, align 2, !tbaa !17
+ %95 = fneg contract half %94
+ %96 = load ptr, ptr %25, align 8, !tbaa !10
+ %97 = load i64, ptr %26, align 8, !tbaa !6
+ %98 = load i64, ptr %24, align 8, !tbaa !6
+ %99 = add i64 %97, %98
+ %100 = getelementptr inbounds %struct.rocfft_complex, ptr %96, i64 %99
+ %101 = getelementptr inbounds %struct.rocfft_complex, ptr %100, i32 0, i32 1
+ store half %95, ptr %101, align 2, !tbaa !17
+ ret void
+
+102: ; preds = %40
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %15) #4
+ call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %16) #4
+ store double 5.000000e-01, ptr %31, align 8, !tbaa !18
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %17) #4
+ store i32 0, ptr %32, align 2
+ store i32 0, ptr %30, align 2
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %17) #4
+ call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %16) #4
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %18) #4
+ call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %19) #4
+ store double 5.000000e-01, ptr %34, align 8, !tbaa !18
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %20) #4
+ store i32 0, ptr %35, align 2
+ store i32 0, ptr %33, align 2
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %20) #4
+ call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %19) #4
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %21) #4
+ %107 = load ptr, ptr %27, align 8, !tbaa !10
+ %108 = load i64, ptr %22, align 8, !tbaa !6
+ %109 = getelementptr inbounds %struct.rocfft_complex, ptr %107, i64 %108
+ call void @llvm.memcpy.p0.p0.i64(ptr align 2 %36, ptr align 2 %109, i64 4, i1 false), !tbaa.struct !12
+ %110 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 0
+ %111 = load half, ptr %110, align 2, !tbaa !15
+ %112 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+ %113 = load half, ptr %112, align 2, !tbaa !15
+ %114 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+ %115 = load half, ptr %114, align 2, !tbaa !17
+ %116 = fmul contract half %113, %115
+ %117 = fadd contract half %111, %116
+ %118 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+ %119 = load half, ptr %118, align 2, !tbaa !17
+ %120 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+ %121 = load half, ptr %120, align 2, !tbaa !15
+ %122 = fmul contract half %119, %121
+ %123 = fadd contract half %117, %122
+ %124 = load ptr, ptr %25, align 8, !tbaa !10
+ %125 = load i64, ptr %26, align 8, !tbaa !6
+ %126 = load i64, ptr %22, align 8, !tbaa !6
+ %127 = add i64 %125, %126
+ %128 = getelementptr inbounds %struct.rocfft_complex, ptr %124, i64 %127
+ %129 = getelementptr inbounds %struct.rocfft_complex, ptr %128, i32 0, i32 0
+ store half %123, ptr %129, align 2, !tbaa !15
+ %130 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 1
+ %131 = load half, ptr %130, align 2, !tbaa !17
+ %132 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+ %133 = load half, ptr %132, align 2, !tbaa !17
+ %134 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+ %135 = load half, ptr %134, align 2, !tbaa !17
+ %136 = fmul contract half %133, %135
+ %137 = fadd contract half %131, %136
+ %138 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+ %139 = load half, ptr %138, align 2, !tbaa !15
+ %140 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+ %141 = load half, ptr %140, align 2, !tbaa !15
+ %142 = fmul contract half %139, %141
+ %143 = fsub contract half %137, %142
+ %144 = load ptr, ptr %25, align 8, !tbaa !10
+ %145 = load i64, ptr %26, align 8, !tbaa !6
+ %146 = load i64, ptr %22, align 8, !tbaa !6
+ %147 = add i64 %145, %146
+ %148 = getelementptr inbounds %struct.rocfft_complex, ptr %144, i64 %147
+ %149 = getelementptr inbounds %struct.rocfft_complex, ptr %148, i32 0, i32 1
+ store half %143, ptr %149, align 2, !tbaa !17
+ %150 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 0
+ %151 = load half, ptr %150, align 2, !tbaa !15
+ %152 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+ %153 = load half, ptr %152, align 2, !tbaa !15
+ %154 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+ %155 = load half, ptr %154, align 2, !tbaa !17
+ %156 = fmul contract half %153, %155
+ %157 = fsub contract half %151, %156
+ %158 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+ %159 = load half, ptr %158, align 2, !tbaa !17
+ %160 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+ %161 = load half, ptr %160, align 2, !tbaa !15
+ %162 = fmul contract half %159, %161
+ %163 = fsub contract half %157, %162
+ %164 = load ptr, ptr %25, align 8, !tbaa !10
+ %165 = load i64, ptr %26, align 8, !tbaa !6
+ %166 = load i64, ptr %23, align 8, !tbaa !6
+ %167 = add i64 %165, %166
+ %168 = getelementptr inbounds %struct.rocfft_complex, ptr %164, i64 %167
+ %169 = getelementptr inbounds %struct.rocfft_complex, ptr %168, i32 0, i32 0
+ store half %163, ptr %169, align 2, !tbaa !15
+ %170 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 1
+ %171 = load half, ptr %170, align 2, !tbaa !17
+ %172 = fneg contract half %171
+ %173 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+ %174 = load half, ptr %173, align 2, !tbaa !17
+ %175 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+ %176 = load half, ptr %175, align 2, !tbaa !17
+ %177 = fmul contract half %174, %176
+ %178 = fadd contract half %172, %177
+ %179 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+ %180 = load half, ptr %179, align 2, !tbaa !15
+ %181 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+ %182 = load half, ptr %181, align 2, !tbaa !15
+ %183 = fmul contract half %180, %182
+ %184 = fsub contract half %178, %183
+ %185 = load ptr, ptr %25, align 8, !tbaa !10
+ %186 = load i64, ptr %26, align 8, !tbaa !6
+ %187 = load i64, ptr %23, align 8, !tbaa !6
+ %188 = add i64 %186, %187
+ %189 = getelementptr inbounds %struct.rocfft_complex, ptr %185, i64 %188
+ %190 = getelementptr inbounds %struct.rocfft_complex, ptr %189, i32 0, i32 1
+ store half %184, ptr %190, align 2, !tbaa !17
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %21) #4
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %18) #4
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %15) #4
+ ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { convergent inlinehint mustprogress nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+cumode,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+sramecc,+wavefrontsize64,-xnack" }
+attributes #3 = { convergent mustprogress nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+cumode,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+sramecc,+wavefrontsize64,-xnack" }
+attributes #4 = { nounwind }
+attributes #5 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+!llvm.ident = !{!4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4}
+!opencl.ocl.version = !{!5, !5, !5, !5, !5, !5, !5, !5, !5, !5}
+
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 8, !"PIC Level", i32 2}
+!4 = !{!"clang version 19.0.0git (ssh://padivedi@gerrit-git.amd.com:29418/lightning/ec/llvm-project a2421f3d00e8e99003ddde4ce19939737b57d043)"}
+!5 = !{i32 2, i32 0}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"long", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"any pointer", !8, i64 0}
+!12 = !{i64 0, i64 2, !13, i64 2, i64 2, !13}
+!13 = !{!14, !14, i64 0}
+!14 = !{!"_Float16", !8, i64 0}
+!15 = !{!16, !14, i64 0}
+!16 = !{!"_ZTS14rocfft_complexIDF16_E", !14, i64 0, !14, i64 2}
+!17 = !{!16, !14, i64 2}
+!18 = !{!19, !19, i64 0}
+!19 = !{!"double", !8, i64 0}
``````````
</details>
https://github.com/llvm/llvm-project/pull/109395
More information about the llvm-commits mailing list