[llvm] Reland "[NVPTX] Legalize aext-load to zext-load to expose more DAG combines" (PR #155063)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 19:55:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-nvptx
Author: Alex MacLean (AlexMaclean)
<details>
<summary>Changes</summary>
The original version of this change inadvertently dropped the fix from commit b6e19b35cd87f3167a0f04a61a12016b935ab1ea. This version retains that fix, adds tests for it, and explains why it is needed.
---
Patch is 141.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155063.diff
13 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+1-1)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+74-100)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (-2)
- (modified) llvm/test/CodeGen/Mips/implicit-sret.ll (+4-12)
- (modified) llvm/test/CodeGen/Mips/msa/basic_operations.ll (+4-4)
- (modified) llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (+5-5)
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll (+180-198)
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll (+180-198)
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll (+180-198)
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg.ll (+100-110)
- (modified) llvm/test/CodeGen/NVPTX/i8x2-instructions.ll (+115)
- (modified) llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll (+30-34)
- (modified) llvm/test/CodeGen/NVPTX/mulwide.ll (+4-6)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 27b5a0d37b679..e733f680dc345 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15137,7 +15137,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
return foldedExt;
} else if (ISD::isNON_EXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
bool DoXform = true;
SmallVector<SDNode *, 4> SetCCs;
if (!N0.hasOneUse())
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb4bb1195f78b..997c33f1f6a76 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -702,57 +702,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
+ // FP extload/truncstore is not legal in PTX. We need to expand all these.
+ for (auto FloatVTs :
+ {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {
+ for (MVT ValVT : FloatVTs) {
+ for (MVT MemVT : FloatVTs) {
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
+ setTruncStoreAction(ValVT, MemVT, Expand);
+ }
+ }
+ }
- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ for (auto IntVTs :
+ {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})
+ for (MVT ValVT : IntVTs)
+ for (MVT MemVT : IntVTs)
+ if (isTypeLegal(ValVT))
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
+ // PTX does not support load / store predicate registers
+ setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,
+ Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
+ // Disable generations of extload/truncstore for v2i16/v2i8. The generic
+ // expansion for these nodes when they are unaligned is incorrect if the
+ // type is a vector.
+ //
+ // TODO: Fix the generic expansion for these nodes found in
+ // TargetLowering::expandUnalignedLoad/Store.
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
+ MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+
+ // Register custom handling for illegal type loads/stores. We'll try to custom
+ // lower almost all illegal types and logic in the lowering will discard cases
+ // we can't handle.
+ setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
+ setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
+
+ // Custom legalization for LDU intrinsics.
+ // TODO: The logic to lower these is not very robust and we should rewrite it.
+ // Perhaps LDU should not be represented as an intrinsic at all.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (IsPTXVectorType(VT))
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+
setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
ISD::SETGE, ISD::SETLE},
MVT::i1, Expand);
- // expand extload of vector of integers.
- setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
- MVT::v2i8, Expand);
- setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
-
// This is legal in NVPTX
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -767,24 +776,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// DEBUGTRAP can be lowered to PTX brkpt
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::fixedlen_vector_valuetypes())
- if (IsPTXVectorType(VT))
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT,
- Custom);
-
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN},
- {MVT::i128, MVT::f128}, Custom);
-
// Support varargs.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
{MVT::i16, MVT::i32, MVT::i64}, Legal);
@@ -3092,39 +3089,14 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results,
const NVPTXSubtarget &STI);
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
-
- EVT VT = Op.getValueType();
-
- if (NVPTX::isPackedVectorTy(VT)) {
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned loads and have to handle it here.
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, SDLoc(Op));
- }
- }
-
- return SDValue();
-}
-
// v = ld i1* addr
// =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
+static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
+ SDLoc dl(LD);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
+ assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
LD->getBasePtr(), LD->getPointerInfo(),
MVT::i8, LD->getAlign(),
@@ -3133,8 +3105,27 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
// The legalizer (the caller) is expecting two values from the legalized
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
// in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
+ return DAG.getMergeValues({result, LD->getChain()}, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+
+ if (Op.getValueType() == MVT::i1)
+ return lowerLOADi1(LD, DAG);
+
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ if (LD->getExtensionType() == ISD::EXTLOAD) {
+ assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
+ "Unexpected fpext-load");
+ return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
+ LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
+
+ llvm_unreachable("Unexpected custom lowering for load");
}
SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -3144,17 +3135,6 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::i1)
return LowerSTOREi1(Op, DAG);
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned stores and have to handle it here.
- if (NVPTX::isPackedVectorTy(VT) &&
- !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand()))
- return expandUnalignedStore(Store, DAG);
-
- // v2f16/v2bf16/v2i16 don't need special handling.
- if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())
- return SDValue();
-
// Lower store of any other vector type, including v2f32 as we want to break
// it apart since this is not a widely-supported type.
return LowerSTOREVector(Op, DAG);
@@ -4010,14 +3990,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getDataLayout();
Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
+ Info.memVT = getValueType(I.getDataLayout(), I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 27f099e220976..e7f1a4b4c98c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -309,8 +309,6 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
-
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/Mips/implicit-sret.ll b/llvm/test/CodeGen/Mips/implicit-sret.ll
index c8400abacaf8c..88e5119c5ed5b 100644
--- a/llvm/test/CodeGen/Mips/implicit-sret.ll
+++ b/llvm/test/CodeGen/Mips/implicit-sret.ll
@@ -19,9 +19,7 @@ define internal void @test() unnamed_addr nounwind {
; CHECK-NEXT: ld $6, 24($sp)
; CHECK-NEXT: ld $5, 16($sp)
; CHECK-NEXT: ld $7, 32($sp)
-; CHECK-NEXT: lw $1, 0($sp)
-; CHECK-NEXT: # implicit-def: $a0_64
-; CHECK-NEXT: move $4, $1
+; CHECK-NEXT: lw $4, 0($sp)
; CHECK-NEXT: jal use_sret
; CHECK-NEXT: nop
; CHECK-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload
@@ -64,15 +62,9 @@ define internal void @test2() unnamed_addr nounwind {
; CHECK-NEXT: daddiu $4, $sp, 0
; CHECK-NEXT: jal implicit_sret_decl2
; CHECK-NEXT: nop
-; CHECK-NEXT: lw $1, 20($sp)
-; CHECK-NEXT: lw $2, 12($sp)
-; CHECK-NEXT: lw $3, 4($sp)
-; CHECK-NEXT: # implicit-def: $a0_64
-; CHECK-NEXT: move $4, $3
-; CHECK-NEXT: # implicit-def: $a1_64
-; CHECK-NEXT: move $5, $2
-; CHECK-NEXT: # implicit-def: $a2_64
-; CHECK-NEXT: move $6, $1
+; CHECK-NEXT: lw $6, 20($sp)
+; CHECK-NEXT: lw $5, 12($sp)
+; CHECK-NEXT: lw $4, 4($sp)
; CHECK-NEXT: jal use_sret2
; CHECK-NEXT: nop
; CHECK-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
index 4fc3f57aa002d..c3889372b322e 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1904,7 +1904,7 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
-; N64-NEXT: lw $2, 0($2)
+; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 15
; N64-NEXT: ld $1, %got_disp(v16i8)($1)
; N64-NEXT: daddu $1, $1, $2
@@ -1953,7 +1953,7 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
-; N64-NEXT: lw $2, 0($2)
+; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 7
; N64-NEXT: ld $1, %got_disp(v8i16)($1)
; N64-NEXT: dlsa $1, $2, $1, 1
@@ -2002,7 +2002,7 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
-; N64-NEXT: lw $2, 0($2)
+; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 3
; N64-NEXT: ld $1, %got_disp(v4i32)($1)
; N64-NEXT: dlsa $1, $2, $1, 2
@@ -2053,7 +2053,7 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
-; N64-NEXT: lw $2, 0($2)
+; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 1
; N64-NEXT: ld $1, %got_disp(v2i64)($1)
; N64-NEXT: dlsa $1, $2, $1, 3
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index bd4c7775354ae..6c4ae1937e158 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -711,11 +711,11 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0];
-; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1];
-; CHECK-NEXT: and.b32 %r3, %r2, -2147450880;
-; CHECK-NEXT: and.b32 %r4, %r1, 2147450879;
-; CHECK-NEXT: or.b32 %r5, %r4, %r3;
+; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_1];
+; CHECK-NEXT: and.b32 %r2, %r1, -2147450880;
+; CHECK-NEXT: ld.param.b32 %r3, [test_copysign_param_0];
+; CHECK-NEXT: and.b32 %r4, %r3, 2147450879;
+; CHECK-NEXT: or.b32 %r5, %r4, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 6e480996e7e6a..9717efb960f18 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -7,7 +7,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<18>;
+; SM60-NEXT: .reg .b32 %r<17>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
@@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
; SM60-NEXT: shl.b32 %r11, %r10, %r1;
; SM60-NEXT: not.b32 %r2, %r11;
; SM60-NEXT: cvt.u32.u16 %r12, %rs1;
-; SM60-NEXT: and.b32 %r13, %r12, 255;
-; SM60-NEXT: shl.b32 %r3, %r13, %r1;
+; SM60-NEXT: shl.b32 %r3, %r12, %r1;
; SM60-NEXT: shl.b32 %r4, %r7, %r1;
-; SM60-NEXT: ld.global.b32 %r14, [%rd1];
-; SM60-NEXT: and.b32 %r17, %r14, %r2;
+; SM60-NEXT: ld.global.b32 %r13, [%rd1];
+; SM60-NEXT: and.b32 %r16, %r13, %r2;
; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r15, %r17, %r3;
-; SM60-NEXT: or.b32 %r16, %r17, %r4;
-; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
-; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16;
+; SM60-NEXT: or.b32 %r14, %r16, %r3;
+; SM60-NEXT: or.b32 %r15, %r16, %r4;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15;
; SM60-NEXT: @%p1 bra $L__BB0_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1
; SM60-NEXT: and.b32 %r6, %r5, %r2;
-; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT: mov.b32 %r17, %r6;
+; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT: mov.b32 %r16, %r6;
; SM60-NEXT: @%p2 bra $L__BB0_1;
; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end
; SM60-NEXT: st.param.b32 [func_retval0], %r12;
@@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<18>;
+; SM60-NEXT: .reg .b32 %r<17>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
@@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
; SM60-NEXT: shl.b32 %r11, %r10, %r1;
; SM60-NEXT: not.b32 %r2, %r11;
; SM60-NEXT: cvt.u32.u16 %r12, %rs1;
-; SM60-NEXT: and.b32 %r13, %r12, 255;
-; SM60-NEXT: shl.b32 %r3, %r13, %r1;
+; SM60-NEXT: shl.b32 %r3, %r12, %r1;
; SM60-NEXT: shl.b32 %r4, %r7, %r1;
-; SM60-NEXT: ld.global.b32 %r14, [%rd1];
-; SM60-NEXT: and.b32 %r17, %r14, %r2;
+; SM60-NEXT: ld.global.b32 %r13, [%rd1];
+; SM60-NEXT: and.b32 %r16, %r13, %r2;
; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r15, %r17, %r3;
-; SM60-NEXT: or.b32 %r16, %r17, %r4;
-; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
-; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16;
+; SM60-NEXT: or.b32 %r14, %r16, %r3;
+; SM60-NEXT: or.b32 %r15, %r16, %r4;
+; SM60-NEXT: atom.cta.global...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/155063
More information about the llvm-commits mailing list