[llvm] [NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations (PR #98551)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 31 11:24:48 PDT 2024
================
@@ -578,47 +1011,134 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store volatile double %f.add, ptr addrspace(3) %c
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load volatile <2 x i8>, ptr addrspace(3) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <2 x i8> %h.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load volatile <4 x i8>, ptr addrspace(3) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <4 x i8> %i.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load volatile <2 x i16>, ptr addrspace(3) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <2 x i16> %j.add, ptr addrspace(3) %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load volatile <4 x i16>, ptr addrspace(3) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <4 x i16> %k.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load volatile <2 x i32>, ptr addrspace(3) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <2 x i32> %l.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load volatile <4 x i32>, ptr addrspace(3) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <4 x i32> %m.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load volatile <2 x i64>, ptr addrspace(3) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store volatile <2 x i64> %n.add, ptr addrspace(3) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128 bits in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
----------------
Artem-B wrote:
> Right now, it seems that we canonicalize a vector to scalars in some cases (e.g. <4 x i8> to .u32 ops), but not in others (e.g. <2 x i16> lowers to .v2.u16 ops).
> Do you know why?
Probably due to https://github.com/llvm/llvm-project/pull/67866 handling v4i8 as a special case.
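
For illustration, a minimal sketch of that asymmetry, distilled from the shared-memory cases in the hunk above (the function name @repro and the RUN line are assumptions mirroring the test's sm_70 target, not part of the patch):

; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s
define void @repro(ptr addrspace(3) %p) {
  ; v4i8 is special-cased (a legal type since #67866), so it lowers
  ; to a single 32-bit access:
  ; CHECK: ld.volatile.shared.u32
  %a = load volatile <4 x i8>, ptr addrspace(3) %p
  ; CHECK: st.volatile.shared.u32
  store volatile <4 x i8> %a, ptr addrspace(3) %p
  ; v2i8 has no such special case and stays a two-element vector access:
  ; CHECK: ld.volatile.shared.v2.u8
  %b = load volatile <2 x i8>, ptr addrspace(3) %p
  ; CHECK: st.volatile.shared.v2.u8
  store volatile <2 x i8> %b, ptr addrspace(3) %p
  ret void
}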
https://github.com/llvm/llvm-project/pull/98551
More information about the llvm-commits mailing list