[llvm] [WIP][AMDGPU] combine uniform AMDGPU lane Intrinsics (PR #116953)

Diana Picus via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 21 06:25:40 PST 2024


================
@@ -0,0 +1,221 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes="instcombine,amdgpu-uniform-intrinsic-combine" -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX-NEXT:    store i32 77, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.permlane64(i32 77)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX-NEXT:    [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT:    [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT:    ret void
+;
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @permlane64_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT:    [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX-NEXT:    [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT:    [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT:    ret void
+;
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid2 = add i32 %tid, 1
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC0]], i32 [[SRC1]])
+; GFX-NEXT:    store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT:    [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT:    [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT:    [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT:    ret void
+;
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+  %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT:    [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT:    [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX-NEXT:    [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX-NEXT:    [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT:    [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT:    ret void
+;
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+  %tidx2 = add i32 %tidx, 1
+  %tidy2 = add i32 %tidy, 2
+  %v = call i32 @llvm.amdgcn.readlane(i32 %tidx2, i32 %tidy2)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_constant(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    store i32 7, ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_undef(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_sgpr(ptr addrspace(1) %out, i32 %src0) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_sgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; GFX-NEXT:    store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; GFX-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
+; GFX-NEXT:    [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT:    [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT:    ret void
+;
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: define amdgpu_kernel void @readfirstlane_vgpr_expression(
+; GFX-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; GFX-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT:    [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]])
+; GFX-NEXT:    [[TMP1:%.*]] = sext i32 [[TID2]] to i64
+; GFX-NEXT:    [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
+; GFX-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT:    ret void
+;
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid2 = add i32 %tid, 1
+  %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid2)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
----------------
rovka wrote:

Tests for all the combinations of read[first]lane of read[first]lane?

https://github.com/llvm/llvm-project/pull/116953


More information about the llvm-commits mailing list