[llvm] [AMDGPU] Allow forming overflow op and folding abd to usubo if it is legal. (PR #156266)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 31 17:24:38 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: AZero13 (AZero13)
<details>
<summary>Changes</summary>
Because usubo and uaddo are legal in AMDGPU in 32 bits, we want to use it whenever possible.
---
Patch is 221.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156266.diff
46 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+4)
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll (+14-13)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ceil.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-exp2.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-fabs.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-floor.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log10.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-log2.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rint.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-round.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-trunc.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access-asan.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-indirect-access.ll (+7-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test-asan.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-dynamic-lds-test.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multi-static-dynamic-indirect-access.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return-asan.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-multiple-blocks-return.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access-asan.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-indirect-access.ll (+7-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test-asan.ll (+6-5)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-dynamic-lds-test.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param-asan.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-function-param.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll (+10-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll (+25-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll (+25-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-test.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+13-22)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+24-48)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+24-48)
- (modified) llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+222-237)
- (modified) llvm/test/CodeGen/AMDGPU/sad.ll (+13-13)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 438b6ff55c85f..1d8cea1a14c03 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase {
/// matching of other patterns.
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool MathUsed) const {
+ // Form it if it is legal.
+ if (isOperationLegal(Opcode, VT))
+ return true;
+
// TODO: The default logic is inherited from code in CodeGenPrepare.
// The opcode should not make a difference by default?
if (Opcode != ISD::UADDO)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a8c7c16e2fa22..d13011019f3d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9788,7 +9788,8 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
// flag if the (scalar) type is illegal as this is more likely to legalize
// cleanly:
// abdu(lhs, rhs) -> sub(xor(sub(lhs, rhs), uof(lhs, rhs)), uof(lhs, rhs))
- if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT)) {
+ if (!IsSigned && (isOperationLegal(ISD::USUBO, VT) ||
+ (VT.isScalarInteger() && !isTypeLegal(VT)))) {
SDValue USubO =
DAG.getNode(ISD::USUBO, dl, DAG.getVTList(VT, MVT::i1), {LHS, RHS});
SDValue Cmp = DAG.getNode(ISD::SIGN_EXTEND, dl, VT, USubO.getValue(1));
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 42c7b90da63d3..28f55511ebb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
;.
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 7e9cb7adf4fc2..181dab8d4ca79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -105,7 +105,7 @@ declare void @unknown()
define amdgpu_kernel void @kernel_calls_extern() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: ret void
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
; CHECK-NEXT: ret void
;
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: call void [[INDIRECT]]()
; CHECK-NEXT: ret void
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
; CHECK-NEXT: ret void
;
call void %indirect() #0
@@ -254,11 +254,12 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
index 03c84d1193609..f8a38572c1544 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
@@ -6,7 +6,7 @@
define amdgpu_kernel void @noop_sqrt_fpmath(ptr addrspace(1) %out, float %x) #0 {
; CHECK-LABEL: define amdgpu_kernel void @noop_sqrt_fpmath
; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
@@ -20,9 +20,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2:![0-9]+]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -51,7 +51,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -88,9 +88,9 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -149,7 +149,7 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -206,7 +206,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, fl
; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -243,9 +243,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -311,9 +311,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -342,7 +342,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -379,9 +379,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -410,7 +410,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile floa...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/156266
More information about the llvm-commits
mailing list