[llvm] AMDGPU: Move libcall simplify into PeepholeEP (PR #88853)

Tue Apr 16 01:17:31 PDT 2024

https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/88853

>From 46e4256274881a2ef409b19c77fcf3f6a1ead6b4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 16 Apr 2024 09:37:55 +0200
Subject: [PATCH 1/2] AMDGPU: Add baseline test for sincos phase ordering

---
 .../amdgpu-libcall-sincos-pass-ordering.ll    | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
new file mode 100644
index 00000000000000..e9525cdd8769ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 -amdgpu-prelink %s | FileCheck %s
+
+; Make sure that sin+cos -> sincos simplification happens after
+; initial IR simplifications, otherwise we can't identify the common
+; argument value.
+
+@.str = private unnamed_addr addrspace(4) constant [21 x i8] c"x: %f, y: %f, z: %f\0A\00", align 1
+
+; Should have call to sincos declarations, not calls to the asm pseudo-libcalls
+define protected amdgpu_kernel void @swdev456865(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, float noundef %x) #0 {
+; CHECK-LABEL: define protected amdgpu_kernel void @swdev456865(
+; CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT0:%.*]], ptr addrspace(1) nocapture writeonly [[OUT1:%.*]], ptr addrspace(1) nocapture writeonly [[OUT2:%.*]], float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I_I:%.*]] = tail call float asm "pseudo-libcall-sin [[TMP0:%.*]], %1", "=v,v"(float noundef [[X]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[I_I1:%.*]] = tail call float asm "pseudo-libcall-cos [[TMP0]], %1", "=v,v"(float noundef [[X]]) #[[ATTR1]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[I_I]], [[I_I1]]
+; CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X]] to double
+; CHECK-NEXT:    [[CONV5:%.*]] = fpext float [[ADD]] to double
+; CHECK-NEXT:    store double [[CONV]], ptr addrspace(1) [[OUT0]], align 8
+; CHECK-NEXT:    store double [[CONV5]], ptr addrspace(1) [[OUT1]], align 8
+; CHECK-NEXT:    store double [[CONV5]], ptr addrspace(1) [[OUT2]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %x.addr = alloca float, align 4, addrspace(5)
+  %y = alloca float, align 4, addrspace(5)
+  %z = alloca float, align 4, addrspace(5)
+  store float %x, ptr addrspace(5) %x.addr, align 4
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %y)
+  %i = load float, ptr addrspace(5) %x.addr, align 4
+  %call = call float @_Z3sinf(float noundef %i) #3
+  %i1 = load float, ptr addrspace(5) %x.addr, align 4
+  %call1 = call float @_Z3cosf(float noundef %i1) #3
+  %add = fadd float %call, %call1
+  store float %add, ptr addrspace(5) %y, align 4
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %z)
+  %i2 = load float, ptr addrspace(5) %x.addr, align 4
+  %call2 = call float @_Z3cosf(float noundef %i2) #3
+  %i3 = load float, ptr addrspace(5) %x.addr, align 4
+  %call3 = call float @_Z3sinf(float noundef %i3) #3
+  %add4 = fadd float %call2, %call3
+  store float %add4, ptr addrspace(5) %z, align 4
+  %i4 = load float, ptr addrspace(5) %x.addr, align 4
+  %conv = fpext float %i4 to double
+  %i5 = load float, ptr addrspace(5) %y, align 4
+  %conv5 = fpext float %i5 to double
+  %i6 = load float, ptr addrspace(5) %z, align 4
+  %conv6 = fpext float %i6 to double
+  store double %conv, ptr addrspace(1) %out0, align 8
+  store double %conv5, ptr addrspace(1) %out1, align 8
+  store double %conv6, ptr addrspace(1) %out2, align 8
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %z)
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %y)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #1
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1
+
+define internal float @_Z3cosf(float noundef %arg) #2 {
+bb:
+  %i = tail call float asm "pseudo-libcall-cos %0, %1", "=v,v"(float noundef %arg) #2
+  ret float %i
+}
+
+define internal float @_Z3sinf(float noundef %arg) #2 {
+bb:
+  %i = tail call float asm "pseudo-libcall-sin %0, %1", "=v,v"(float noundef %arg) #2
+  ret float %i
+}
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { mustprogress nofree norecurse nounwind willreturn memory(none) }
+attributes #3 = { nounwind willreturn memory(none) }

>From 0adb3eb24eeb985a7b9b701979eadc4ad3331f02 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 16 Apr 2024 09:36:26 +0200
Subject: [PATCH 2/2] AMDGPU: Move libcall simplify into PeepholeEP

We were running this immediately on the incoming IR, which
is still littered with temporary allocas obscuring trivial values.
This needs to run after initial SROA to handle sincos insertion.

Fixes: SWDEV-456865
---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp      | 13 ++++++++++---
 .../AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll   |  7 ++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f7e552177d6f50..305a6c8c3b9262 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -655,9 +655,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
   PB.registerPipelineStartEPCallback(
       [](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
-        FPM.addPass(AMDGPUUseNativeCallsPass());
-        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
-          FPM.addPass(AMDGPUSimplifyLibCallsPass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         if (EnableHipStdPar)
           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
@@ -681,6 +678,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
           PM.addPass(AMDGPUAlwaysInlinePass());
       });
 
+  PB.registerPeepholeEPCallback(
+      [](FunctionPassManager &FPM, OptimizationLevel Level) {
+        if (Level == OptimizationLevel::O0)
+          return;
+
+        FPM.addPass(AMDGPUUseNativeCallsPass());
+        if (EnableLibCallSimplify)
+          FPM.addPass(AMDGPUSimplifyLibCallsPass());
+      });
+
   PB.registerCGSCCOptimizerLateEPCallback(
       [this](CGSCCPassManager &PM, OptimizationLevel Level) {
         if (Level == OptimizationLevel::O0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
index e9525cdd8769ab..6b835bb4eef662 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
@@ -12,9 +12,10 @@ define protected amdgpu_kernel void @swdev456865(ptr addrspace(1) %out0, ptr add
 ; CHECK-LABEL: define protected amdgpu_kernel void @swdev456865(
 ; CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT0:%.*]], ptr addrspace(1) nocapture writeonly [[OUT1:%.*]], ptr addrspace(1) nocapture writeonly [[OUT2:%.*]], float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I_I:%.*]] = tail call float asm "pseudo-libcall-sin [[TMP0:%.*]], %1", "=v,v"(float noundef [[X]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT:    [[I_I1:%.*]] = tail call float asm "pseudo-libcall-cos [[TMP0]], %1", "=v,v"(float noundef [[X]]) #[[ATTR1]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[I_I]], [[I_I1]]
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[I_I:%.*]] = call float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[I_I2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[I_I]], [[I_I2]]
 ; CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X]] to double
 ; CHECK-NEXT:    [[CONV5:%.*]] = fpext float [[ADD]] to double
 ; CHECK-NEXT:    store double [[CONV]], ptr addrspace(1) [[OUT0]], align 8