[llvm] 45ca943 - [AMDGPU] Select no-return atomic intrinsics in tblgen
Abinav Puthan Purayil via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 21 21:10:23 PDT 2022
Author: Abinav Puthan Purayil
Date: 2022-04-22T09:37:40+05:30
New Revision: 45ca94334ef16711f896506e613b4abb28bbc37c
URL: https://github.com/llvm/llvm-project/commit/45ca94334ef16711f896506e613b4abb28bbc37c
DIFF: https://github.com/llvm/llvm-project/commit/45ca94334ef16711f896506e613b4abb28bbc37c.diff
LOG: [AMDGPU] Select no-return atomic intrinsics in tblgen
This is to avoid relying on the post-isel hook.
This change also enable the saddr pattern selection for atomic
intrinsics in GlobalISel.
Differential Revision: https://reviews.llvm.org/D123583
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/BUFInstructions.td
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 9403d22b9227e..15ea0597376cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -540,6 +540,31 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>;
// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
// GlobalISelEmitter allows pattern matches where src and dst def count
// mismatch.
+
+multiclass ret_noret_op {
+ let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return true; }] in {
+ def "_ret" : PatFrag<(ops node:$ptr, node:$data),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
+ }
+
+ let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
+ GISelPredicateCode = [{ return false; }] in {
+ def "_noret" : PatFrag<(ops node:$ptr, node:$data),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
+ }
+}
+
+defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
+defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
+defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
+defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
+defm int_amdgcn_global_atomic_fadd : ret_noret_op;
+defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
+defm int_amdgcn_global_atomic_fmin : ret_noret_op;
+defm int_amdgcn_global_atomic_fmax : ret_noret_op;
+defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
+
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
GISelPredicateCode = [{ return false; }] in {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 4e7efef3f9b15..0d8a84ccd6289 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1188,9 +1188,9 @@ let SubtargetPredicate = isGFX90APlus in {
let SubtargetPredicate = isGFX90AOnly;
}
- defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
- defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
- defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
@@ -1381,10 +1381,11 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
-multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst> {
+multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
foreach RtnMode = ["ret", "noret"] in {
- defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode # "_" # vt.Size);
+ defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
+ # !if(isIntr, "", "_" # vt.Size));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
def : GCNPat<
@@ -1592,6 +1593,9 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_P
}
let SubtargetPredicate = isGFX90APlus in {
+ defm : BufferAtomicPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64", 1>;
+ defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64", 1>;
+ defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64", 1>;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 049d8000d31ce..1732fd42d6aa7 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1025,9 +1025,13 @@ def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
let SubtargetPredicate = isGFX940Plus in {
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
+ (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
+def : GCNPat <
+ (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
+ (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
+>;
}
def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 3f0c42578a11c..7b457ab1258bc 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -711,17 +711,17 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isGFX7GFX10
let SubtargetPredicate = isGFX90APlus in {
- defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
- defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
- defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
- defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
- defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
- defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
let SubtargetPredicate = isGFX940Plus in {
- defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
- defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
} // End SubtargetPredicate = isGFX940Plus
@@ -897,15 +897,15 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
defm GLOBAL_ATOMIC_FMIN :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FMAX :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
defm GLOBAL_ATOMIC_FMIN_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
defm GLOBAL_ATOMIC_FMAX_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
let is_flat_global = 1 in {
@@ -920,10 +920,10 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
let OtherPredicates = [isGFX90APlus] in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
- "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+ "global_atomic_add_f32", VGPR_32, f32
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+ "global_atomic_pk_add_f16", VGPR_32, v2f16
>;
} // End OtherPredicates = [isGFX90APlus]
} // End is_flat_global = 1
@@ -1029,13 +1029,30 @@ multiclass FlatAtomicPat <string inst, string node, ValueType vt,
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
+multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+
+ def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedIntrPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
-class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
- ValueType data_vt = vt> : GCNPat <
+class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+ ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
@@ -1237,7 +1254,7 @@ multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, V
multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> {
- def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
+ def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
let AddedComplexity = 10;
}
@@ -1247,13 +1264,12 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod
}
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
- ValueType data_vt = vt> {
- defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
- defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
let AddedComplexity = 10 in {
- def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
- def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
+ defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
}
let AddedComplexity = 11 in {
@@ -1262,6 +1278,11 @@ multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
}
}
+multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
@@ -1427,6 +1448,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f3
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
}
let OtherPredicates = [HasAtomicFaddInsts] in {
@@ -1440,19 +1465,26 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_ret_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64, atomic_load_fadd_flat_noret_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_ret_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64, atomic_load_fmin_flat_noret_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_ret_64, f64>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64, atomic_load_fmax_flat_noret_64, f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
+defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
+defm : FlatSignedIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
+defm : FlatSignedIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
}
let OtherPredicates = [isGFX940Plus] in {
-def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F32_RTN, atomic_load_fadd_flat_32, f32>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_F16_RTN, atomic_load_fadd_v2f16_flat_32, v2f16>;
-def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_BF16_RTN, int_amdgcn_flat_atomic_fadd_v2bf16, v2i16>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>;
+defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
+defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
+defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
}
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index f6a84f553f3b7..eec9e8ad3e271 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -319,10 +319,10 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %pt
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -333,10 +333,10 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -347,10 +347,10 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
index aa195f8c01764..26373b390e3c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -74,10 +74,10 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NEXT: global_atomic_add_f32 v0, v1, v0, s[0:1] offset:2048 glc
; GFX90A-NEXT: s_endpgm
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
index 823d4411f4552..e1e2781569e7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
@@ -23,13 +23,12 @@ define amdgpu_kernel void @global_atomic_fmin_f32_noret(float addrspace(1)* %ptr
; G_GFX10-LABEL: global_atomic_fmin_f32_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x1
-; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s4
-; G_GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT: global_atomic_fmin v0, v1, v0, s[2:3] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@@ -51,13 +50,12 @@ define amdgpu_kernel void @global_atomic_fmax_f32_noret(float addrspace(1)* %ptr
; G_GFX10-LABEL: global_atomic_fmax_f32_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x1
-; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s4
-; G_GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT: global_atomic_fmax v0, v1, v0, s[2:3] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@@ -120,12 +118,11 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
; G_GFX10-LABEL: global_atomic_fmin_f64_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
-; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -146,12 +143,11 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
; G_GFX10-LABEL: global_atomic_fmax_f64_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
-; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
More information about the llvm-commits
mailing list