[llvm] [AMDGPU] Remove setcc by using add/sub carryout (PR #155255)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 07:56:00 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (LU-JOHN)
<details>
<summary>Changes</summary>
Remove setcc instruction by utilizing add/sub carryout.
Addresses https://github.com/llvm/llvm-project/issues/152992.
---
Patch is 111.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155255.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+56)
- (added) llvm/test/CodeGen/AMDGPU/addsub64_carry.ll (+177)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+34-46)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+187-191)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+161-165)
- (modified) llvm/test/CodeGen/AMDGPU/uaddsat.ll (+21-26)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+158-162)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+106-109)
- (modified) llvm/test/CodeGen/AMDGPU/usubsat.ll (+20-34)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f5..e38af343beeaf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15972,6 +15972,62 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
+ // Eliminate setcc by using carryout from add/sub instruction
+
+ // X = ADD i64 Y, Z Xlo = UADDO i32 Ylo, Zlo
+ // setcc X ult Y -> XHi = UADDO_CARRY i32 Yhi, Zhi
+ // similarly for subtraction
+
+ // X = ADD i64 Y, 1 Xlo = UADDO i32 Ylo, 1
+ // setcc X eq 0 -> XHi = UADDO_CARRY i32 Yhi, 0
+
+ // Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
+ // non-divergent operations. This can result in lo/hi 32-bit operations
+ // being done in SGPR and VGPR with additional operations being needed
+ // to move operands and/or generate the intermediate carry.
+ if (VT == MVT::i64 && N->isDivergent() &&
+ ((((LHS.getOpcode() == ISD::ADD && CC == ISD::SETULT) ||
+ (LHS.getOpcode() == ISD::SUB && CC == ISD::SETUGT)) &&
+ LHS.getOperand(0) == RHS) ||
+ (LHS.getOpcode() == ISD::ADD && CC == ISD::SETEQ && CRHS &&
+ CRHS->isZero() && dyn_cast<ConstantSDNode>(LHS.getOperand(1)) &&
+ dyn_cast<ConstantSDNode>(LHS.getOperand(1))->isOne()))) {
+ EVT TargetType = MVT::i32;
+ EVT CarryVT = MVT::i1;
+ const SDValue One = DAG.getConstant(1, SL, TargetType);
+ bool IsAdd = LHS.getOpcode() == ISD::ADD;
+
+ SDValue Op0 = LHS.getOperand(0);
+ SDValue Op1 = LHS.getOperand(1);
+
+ SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op0);
+ SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op1);
+
+ SDValue Op0Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op0, One);
+ SDValue Op1Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op1, One);
+
+ SDValue NodeLo =
+ DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
+ DAG.getVTList(TargetType, CarryVT), {Op0Lo, Op1Lo});
+
+ SDValue CarryInHi = SDValue(NodeLo.getNode(), 1);
+ SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
+ SL, DAG.getVTList(TargetType, CarryVT),
+ {Op0Hi, Op1Hi, CarryInHi});
+
+ SDValue ResultLo = SDValue(NodeLo.getNode(), 0);
+ SDValue ResultHi = SDValue(NodeHi.getNode(), 0);
+
+ EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
+ SDValue JoinedResult =
+ DAG.getBuildVector(ConcatType, SL, {ResultLo, ResultHi});
+
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
+ SDValue Overflow = SDValue(NodeHi.getNode(), 1);
+ DCI.CombineTo(LHS.getNode(), Result);
+ return Overflow;
+ }
+
if (VT != MVT::f32 && VT != MVT::f64 &&
(!Subtarget->has16BitInsts() || VT != MVT::f16))
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
new file mode 100644
index 0000000000000..ba0648116e029
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;; Test that carryout from 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
+;; (i.e. no additional compare is generated).
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+%0 = type { i64, i64, i32, i32 }
+%1 = type { [64 x [8 x i64]] }
+%struct.uint96 = type { i64, i32 }
+%struct.uint64pair = type { i64, i64 }
+
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
+; CHECK-LABEL: add64_32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %sum64 = add i64 %val64A, %val64B
+ %obit = icmp ult i64 %sum64, %val64A
+ %obit32 = zext i1 %obit to i32
+ %sum32 = add i32 %val32, %obit32
+ %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
+ %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
+ ret %struct.uint96 %.fca.1.insert
+}
+
+define <2 x i64> @uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_v2i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT: v_add_co_u32_e64 v4, s[4:5], v0, v4
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
+; CHECK-NEXT: v_addc_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+ %res = sext <2 x i1> %obit to <2 x i64>
+ store <2 x i64> %val, ptr %ptrval
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_v2i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT: v_sub_co_u32_e64 v4, s[4:5], v0, v4
+; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
+; CHECK-NEXT: v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+ %res = sext <2 x i1> %obit to <2 x i64>
+ store <2 x i64> %val, ptr %ptrval
+ ret <2 x i64> %res
+}
+
+define i64 @uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+; FIXME: Intermediate compare to generate carryout was transformed. Extend
+; to recognize this.
+define i64 @uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_n1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+define i64 @usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+define i64 @usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_n1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, -1, v0
+; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b71885b54b5a2..1158d73c0c152 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -841,7 +841,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GCN-ISEL-LABEL: name: vuaddo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: V_ADD_U64_PSEUDO
+; GCN-ISEL: V_ADD_CO_U32_e64
define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
; CISI-LABEL: vuaddo64:
@@ -854,9 +854,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s4, s0
; CISI-NEXT: v_mov_b32_e32 v1, s9
; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
-; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
@@ -876,7 +875,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -894,7 +892,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -909,8 +906,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
@@ -923,9 +919,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
@@ -938,9 +933,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
@@ -955,10 +949,9 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -969,16 +962,17 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_u32 v0, s4, s6, v0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
-; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1821,7 +1815,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GCN-ISEL-LABEL: name: vusubo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: V_SUB_U64_PSEUDO
+; GCN-ISEL: V_SUBB_U32_e64
define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
; CISI-LABEL: vusubo64:
@@ -1834,9 +1828,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s4, s0
; CISI-NEXT: v_mov_b32_e32 v1, s9
; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
-; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
@@ -1856,7 +1849,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -1874,7 +1866,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -1889,8 +1880,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
@@ -1903,9 +1893,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
@@ -1918,9 +1907,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
@@ -1935,10 +1923,9 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/155255
More information about the llvm-commits
mailing list