[llvm] [NVPTX] Legalize ctpop and ctlz in operation legalization (PR #130668)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 09:57:53 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/130668
>From ba2619fc5ce5c65825dfd5583777c82aac6363f6 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Mon, 3 Mar 2025 01:03:08 +0000
Subject: [PATCH 1/4] pre-commit tests -- use update_llc_test_checks.py
---
llvm/test/CodeGen/NVPTX/ctlz.ll | 177 +++++++++++++--------
llvm/test/CodeGen/NVPTX/intrinsics.ll | 216 +++++++++++++++++++-------
2 files changed, 277 insertions(+), 116 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 9f91504ad9966..c4c1889a1f29e 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -10,44 +11,62 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
; There should be no difference between llvm.ctlz.i32(%a, true) and
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 32.
-; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
ret i32 %val
}
-; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
ret i32 %val
}
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
; value, so here we have to zero-extend it.
-; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
ret i64 %val
}
-; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: cvt.u64.u32
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
ret i64 %val
}
@@ -55,22 +74,32 @@ define i64 @myctlz64_2(i64 %a) {
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
; natural return width of ptx's clz.b64 instruction. No conversions should be
; necessary in the PTX.
-; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
%trunc = trunc i64 %val to i32
ret i32 %trunc
}
-; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: clz.b64
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz64_as_32_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
+; CHECK-NEXT: clz.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
%trunc = trunc i64 %val to i32
ret i32 %trunc
@@ -80,53 +109,77 @@ define i32 @myctlz64_as_32_2(i64 %a) {
; and then truncating the result back down to i16. But the NVPTX ABI
; zero-extends i16 return values to i32, so the final truncation doesn't appear
; in this function.
-; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_ret16_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: sub.s32 %r3, %r2, 16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
ret i16 %val
}
-; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-NEXT: sub.
-; CHECK-NEXT: st.param.
-; CHECK-NEXT: ret;
+; CHECK-LABEL: myctlz_ret16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_ret16_2_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: sub.s32 %r3, %r2, 16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
ret i16 %val
}
; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
; remain.
-; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_store16_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
+; CHECK-NEXT: sub.s16 %rs3, %rs2, 16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %rs3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
ret void
}
-; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, ptr %b) {
-; CHECK: ld.param.
-; CHECK-NEXT: cvt.u32.u16
-; CHECK-NEXT: clz.b32
-; CHECK-DAG: cvt.u16.u32
-; CHECK-DAG: sub.
-; CHECK: st.{{[a-z]}}16
-; CHECK: ret;
+; CHECK-LABEL: myctlz_store16_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_store16_2_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: clz.b32 %r2, %r1;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
+; CHECK-NEXT: sub.s16 %rs3, %rs2, 16;
+; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
+; CHECK-NEXT: st.u16 [%rd1], %rs3;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
ret void
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index e424e72ecc8f5..0c324ea0912bf 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,61 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
-; CHECK-LABEL: test_fabsf(
define float @test_fabsf(float %f) {
-; CHECK: abs.f32
+; CHECK-LABEL: test_fabsf(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_fabsf_param_0];
+; CHECK-NEXT: abs.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.fabs.f32(float %f)
ret float %x
}
-; CHECK-LABEL: test_fabs(
define double @test_fabs(double %d) {
-; CHECK: abs.f64
+; CHECK-LABEL: test_fabs(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [test_fabs_param_0];
+; CHECK-NEXT: abs.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT: ret;
%x = call double @llvm.fabs.f64(double %d)
ret double %x
}
-; CHECK-LABEL: test_nvvm_sqrt(
define float @test_nvvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_nvvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_nvvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%val = call float @llvm.nvvm.sqrt.f(float %a)
ret float %val
}
-; CHECK-LABEL: test_llvm_sqrt(
define float @test_llvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_llvm_sqrt(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [test_llvm_sqrt_param_0];
+; CHECK-NEXT: sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%val = call float @llvm.sqrt.f32(float %a)
ret float %val
}
-; CHECK-LABEL: test_bitreverse32(
define i32 @test_bitreverse32(i32 %a) {
-; CHECK: brev.b32
+; CHECK-LABEL: test_bitreverse32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_bitreverse32_param_0];
+; CHECK-NEXT: brev.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.bitreverse.i32(i32 %a)
ret i32 %val
}
-; CHECK-LABEL: test_bitreverse64(
define i64 @test_bitreverse64(i64 %a) {
-; CHECK: brev.b64
+; CHECK-LABEL: test_bitreverse64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_bitreverse64_param_0];
+; CHECK-NEXT: brev.b64 %rd2, %rd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.bitreverse.i64(i64 %a)
ret i64 %val
}
-; CHECK-LABEL: test_popc32(
define i32 @test_popc32(i32 %a) {
-; CHECK: popc.b32
+; CHECK-LABEL: test_popc32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_popc32_param_0];
+; CHECK-NEXT: popc.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i32 @llvm.ctpop.i32(i32 %a)
ret i32 %val
}
-; CHECK-LABEL: test_popc64
define i64 @test_popc64(i64 %a) {
-; CHECK: popc.b64
-; CHECK: cvt.u64.u32
+; CHECK-LABEL: test_popc64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctpop.i64(i64 %a)
ret i64 %val
}
@@ -63,10 +121,17 @@ define i64 @test_popc64(i64 %a) {
; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
; if this function returns an i32, there's no need to do any type conversions
; in the ptx.
-; CHECK-LABEL: test_popc64_trunc
define i32 @test_popc64_trunc(i64 %a) {
-; CHECK: popc.b64
-; CHECK-NOT: cvt.
+; CHECK-LABEL: test_popc64_trunc(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_trunc_param_0];
+; CHECK-NEXT: popc.b64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = call i64 @llvm.ctpop.i64(i64 %a)
%trunc = trunc i64 %val to i32
ret i32 %trunc
@@ -74,11 +139,7 @@ define i32 @test_popc64_trunc(i64 %a) {
; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and
; then converting back to i16.
-; CHECK-LABEL: test_popc16
define void @test_popc16(i16 %a, ptr %b) {
-; CHECK: cvt.u32.u16
-; CHECK: popc.b32
-; CHECK: cvt.u16.u32
%val = call i16 @llvm.ctpop.i16(i16 %a)
store i16 %val, ptr %b
ret void
@@ -86,11 +147,18 @@ define void @test_popc16(i16 %a, ptr %b) {
; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need
; to do any conversions after calling popc.b32, because that returns an i32.
-; CHECK-LABEL: test_popc16_to_32
define i32 @test_popc16_to_32(i16 %a) {
-; CHECK: cvt.u32.u16
-; CHECK: popc.b32
-; CHECK-NOT: cvt.
+; CHECK-LABEL: test_popc16_to_32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [test_popc16_to_32_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: popc.b32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = call i16 @llvm.ctpop.i16(i16 %a)
%zext = zext i16 %val to i32
ret i32 %zext
@@ -98,78 +166,118 @@ define i32 @test_popc16_to_32(i16 %a) {
; Most of nvvm.read.ptx.sreg.* intrinsics always return the same value and may
; be CSE'd.
-; CHECK-LABEL: test_tid
define i32 @test_tid() {
-; CHECK: mov.u32 %r{{.*}}, %tid.x;
+; CHECK-LABEL: test_tid(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %tid.x;
+; CHECK-NEXT: add.s32 %r2, %r1, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%a = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; CHECK-NOT: mov.u32 %r{{.*}}, %tid.x;
%b = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%ret = add i32 %a, %b
-; CHECK: ret
ret i32 %ret
}
; reading clock() or clock64() should not be CSE'd as each read may return
; different value.
-; CHECK-LABEL: test_clock
define i32 @test_clock() {
-; CHECK: mov.u32 %r{{.*}}, %clock;
+; CHECK-LABEL: test_clock(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %clock;
+; CHECK-NEXT: mov.u32 %r2, %clock;
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%a = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
-; CHECK: mov.u32 %r{{.*}}, %clock;
%b = tail call i32 @llvm.nvvm.read.ptx.sreg.clock()
%ret = add i32 %a, %b
-; CHECK: ret
ret i32 %ret
}
-; CHECK-LABEL: test_clock64
define i64 @test_clock64() {
-; CHECK: mov.u64 %r{{.*}}, %clock64;
+; CHECK-LABEL: test_clock64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %clock64;
+; CHECK-NEXT: mov.u64 %rd2, %clock64;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%a = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
-; CHECK: mov.u64 %r{{.*}}, %clock64;
%b = tail call i64 @llvm.nvvm.read.ptx.sreg.clock64()
%ret = add i64 %a, %b
-; CHECK: ret
ret i64 %ret
}
-; CHECK-LABEL: test_exit
define void @test_exit() {
-; CHECK: exit;
+; CHECK-LABEL: test_exit(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: exit;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.exit()
ret void
}
-; CHECK-LABEL: test_globaltimer
define i64 @test_globaltimer() {
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+; CHECK-LABEL: test_globaltimer(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %globaltimer;
+; CHECK-NEXT: mov.u64 %rd2, %globaltimer;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%a = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
%b = tail call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
%ret = add i64 %a, %b
-; CHECK: ret
ret i64 %ret
}
-; CHECK-LABEL: test_cyclecounter
define i64 @test_cyclecounter() {
-; CHECK: mov.u64 %r{{.*}}, %clock64;
+; CHECK-LABEL: test_cyclecounter(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %clock64;
+; CHECK-NEXT: mov.u64 %rd2, %clock64;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%a = tail call i64 @llvm.readcyclecounter()
-; CHECK: mov.u64 %r{{.*}}, %clock64;
%b = tail call i64 @llvm.readcyclecounter()
%ret = add i64 %a, %b
-; CHECK: ret
ret i64 %ret
}
-; CHECK-LABEL: test_steadycounter
define i64 @test_steadycounter() {
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+; CHECK-LABEL: test_steadycounter(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd1, %globaltimer;
+; CHECK-NEXT: mov.u64 %rd2, %globaltimer;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%a = tail call i64 @llvm.readsteadycounter()
-; CHECK: mov.u64 %r{{.*}}, %globaltimer;
%b = tail call i64 @llvm.readsteadycounter()
%ret = add i64 %a, %b
-; CHECK: ret
ret i64 %ret
}
>From ee990a904cb5d0a57523b0017cdace3643abc8ee Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Mon, 3 Mar 2025 17:51:00 +0000
Subject: [PATCH 2/4] [NVPTX] Legalize ctpop and ctlz in operation legalization
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 53 ++++++++++++---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 71 ++++-----------------
llvm/test/CodeGen/NVPTX/ctlz.ll | 34 ++++------
llvm/test/CodeGen/NVPTX/intrinsics.ll | 4 +-
4 files changed, 68 insertions(+), 94 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b62c15ddb97d3..0627ddf7d4ee9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -764,16 +764,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom handling for i8 intrinsics
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
- for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
- setOperationAction(ISD::ABS, Ty, Legal);
- setOperationAction(ISD::SMIN, Ty, Legal);
- setOperationAction(ISD::SMAX, Ty, Legal);
- setOperationAction(ISD::UMIN, Ty, Legal);
- setOperationAction(ISD::UMAX, Ty, Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
+ {MVT::i16, MVT::i32, MVT::i64}, Legal);
- setOperationAction(ISD::CTPOP, Ty, Legal);
- setOperationAction(ISD::CTLZ, Ty, Legal);
- }
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, {MVT::i16, MVT::i64}, Custom);
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2748,6 +2743,42 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+static SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) {
+ SDValue V = Op->getOperand(0);
+ SDLoc DL(Op);
+
+ if (V.getValueType() == MVT::i16) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+ SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Zext);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, CT, SDNodeFlags::NoWrap);
+ }
+ if (V.getValueType() == MVT::i64) {
+ SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+ }
+ llvm_unreachable("Unexpected CTPOP type to legalize");
+}
+
+static SDValue lowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+ SDValue V = Op->getOperand(0);
+ SDLoc DL(Op);
+
+ if (V.getValueType() == MVT::i16) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+ SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, Zext);
+ SDValue Sub =
+ DAG.getNode(ISD::ADD, DL, MVT::i32, CT,
+ DAG.getConstant(APInt(32, -16, true), DL, MVT::i32),
+ SDNodeFlags::NoSignedWrap);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap);
+ }
+ if (V.getValueType() == MVT::i64) {
+ SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+ }
+ llvm_unreachable("Unexpected CTLZ type to legalize");
+}
+
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2833,6 +2864,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
// Used only for bf16 on SM80, where we select fma for non-ftz operation
return PromoteBinOpIfF32FTZ(Op, DAG);
+ case ISD::CTPOP:
+ return lowerCTPOP(Op, DAG);
+ case ISD::CTLZ:
+ return lowerCTLZ(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f94d7099f1b0e..3c88551d7b23c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
(SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
-// Count leading zeros
let hasSideEffects = false in {
- def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32 \t$d, $a;", []>;
- def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64 \t$d, $a;", []>;
+ foreach RT = [I32RT, I64RT] in {
+ // Count leading zeros
+ def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "clz.b" # RT.Size # " \t$d, $a;",
+ [(set i32:$d, (ctlz RT.Ty:$a))]>;
+
+ // Population count
+ def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "popc.b" # RT.Size # " \t$d, $a;",
+ [(set i32:$d, (ctpop RT.Ty:$a))]>;
+ }
}
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
-
-// The return type of the ctlz ISD node is the same as its input, but the PTX
-// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
-// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
-// truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
-
-// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
-// result back to 16-bits if necessary. We also need to subtract 16 because
-// the high-order 16 zeros were counted.
-//
-// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
-// use to save one SASS instruction (on sm_35 anyway):
-//
-// mov.b32 $tmp, {0xffff, $a}
-// ctlz.b32 $result, $tmp
-//
-// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
-// and then ctlz that value. This way we don't have to subtract 16 from the
-// result. Unfortunately today we don't have a way to generate
-// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(i16 (ctlz i16:$a)),
- (SUBi16ri (CVT_u16_u32
- (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
- (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
-
-// Population count
-let hasSideEffects = false in {
- def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32 \t$d, $a;", []>;
- def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
-// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
-// pattern that avoids the type conversion if we're truncating the result to
-// i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
-
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
-// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop i16:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
- (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
-
// fpround f32 -> f16
def : Pat<(f16 (fpround f32:$a)),
(CVT_f16_f32 $a, CvtRN)>;
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index c4c1889a1f29e..4f414d600f248 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -112,14 +112,12 @@ define i32 @myctlz64_as_32_2(i64 %a) {
define i16 @myctlz_ret16(i16 %a) {
; CHECK-LABEL: myctlz_ret16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_ret16_param_0];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
-; CHECK-NEXT: sub.s32 %r3, %r2, 16;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
@@ -128,14 +126,12 @@ define i16 @myctlz_ret16(i16 %a) {
define i16 @myctlz_ret16_2(i16 %a) {
; CHECK-LABEL: myctlz_ret16_2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_ret16_2_param_0];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
-; CHECK-NEXT: sub.s32 %r3, %r2, 16;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
@@ -147,18 +143,15 @@ define i16 @myctlz_ret16_2(i16 %a) {
define void @myctlz_store16(i16 %a, ptr %b) {
; CHECK-LABEL: myctlz_store16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_store16_param_0];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
-; CHECK-NEXT: sub.s16 %rs3, %rs2, 16;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
-; CHECK-NEXT: st.u16 [%rd1], %rs3;
+; CHECK-NEXT: st.u16 [%rd1], %r3;
; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
@@ -167,18 +160,15 @@ define void @myctlz_store16(i16 %a, ptr %b) {
define void @myctlz_store16_2(i16 %a, ptr %b) {
; CHECK-LABEL: myctlz_store16_2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u16 %rs1, [myctlz_store16_2_param_0];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
-; CHECK-NEXT: sub.s16 %rs3, %rs2, 16;
+; CHECK-NEXT: add.s32 %r3, %r2, -16;
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
-; CHECK-NEXT: st.u16 [%rd1], %rs3;
+; CHECK-NEXT: st.u16 [%rd1], %r3;
; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
store i16 %val, ptr %b
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 0c324ea0912bf..57493558419c5 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -150,12 +150,10 @@ define void @test_popc16(i16 %a, ptr %b) {
define i32 @test_popc16_to_32(i16 %a) {
; CHECK-LABEL: test_popc16_to_32(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u16 %rs1, [test_popc16_to_32_param_0];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.param.u16 %r1, [test_popc16_to_32_param_0];
; CHECK-NEXT: popc.b32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
>From 554db43da13880e55bc4b300e5f540afd77e5048 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Mon, 10 Mar 2025 22:40:53 +0000
Subject: [PATCH 3/4] address comments + cleanup
---
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 46 +++++--------------
llvm/test/CodeGen/NVPTX/ctlz.ll | 4 +-
llvm/test/CodeGen/NVPTX/intrinsics.ll | 26 ++++++++++-
4 files changed, 40 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 9e8227de261f1..1cacab9528caa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5113,7 +5113,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
DAG.getConstant(NVT.getSizeInBits() -
OVT.getSizeInBits(), dl, NVT));
}
- Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ Results.push_back(
+ DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1, SDNodeFlags::NoWrap));
break;
}
case ISD::CTLZ_ZERO_UNDEF: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 0627ddf7d4ee9..b768725b04256 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -767,8 +767,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
{MVT::i16, MVT::i32, MVT::i64}, Legal);
+ setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i16,
+ Promote);
setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
- setOperationAction({ISD::CTPOP, ISD::CTLZ}, {MVT::i16, MVT::i64}, Custom);
+ setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i64, Custom);
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2743,40 +2745,17 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-static SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) {
+// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
+// Lower these into a node returning the correct type which is zero-extended
+// back to the correct size.
+static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
SDValue V = Op->getOperand(0);
- SDLoc DL(Op);
-
- if (V.getValueType() == MVT::i16) {
- SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
- SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Zext);
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, CT, SDNodeFlags::NoWrap);
- }
- if (V.getValueType() == MVT::i64) {
- SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V);
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
- }
- llvm_unreachable("Unexpected CTPOP type to legalize");
-}
+ assert(V.getValueType() == MVT::i64 &&
+ "Unexpected CTLZ/CTPOP type to legalize");
-static SDValue lowerCTLZ(SDValue Op, SelectionDAG &DAG) {
- SDValue V = Op->getOperand(0);
SDLoc DL(Op);
-
- if (V.getValueType() == MVT::i16) {
- SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
- SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, Zext);
- SDValue Sub =
- DAG.getNode(ISD::ADD, DL, MVT::i32, CT,
- DAG.getConstant(APInt(32, -16, true), DL, MVT::i32),
- SDNodeFlags::NoSignedWrap);
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap);
- }
- if (V.getValueType() == MVT::i64) {
- SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V);
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
- }
- llvm_unreachable("Unexpected CTLZ type to legalize");
+ SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
}
SDValue
@@ -2865,9 +2844,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
// Used only for bf16 on SM80, where we select fma for non-ftz operation
return PromoteBinOpIfF32FTZ(Op, DAG);
case ISD::CTPOP:
- return lowerCTPOP(Op, DAG);
case ISD::CTLZ:
- return lowerCTLZ(Op, DAG);
+ return lowerCTLZCTPOP(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 4f414d600f248..1443e5c46346c 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -130,8 +130,8 @@ define i16 @myctlz_ret16_2(i16 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
-; CHECK-NEXT: clz.b32 %r2, %r1;
-; CHECK-NEXT: add.s32 %r3, %r2, -16;
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: clz.b32 %r3, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 57493558419c5..cc6af060d6c0a 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK64
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
@@ -140,6 +140,28 @@ define i32 @test_popc64_trunc(i64 %a) {
; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and
; then converting back to i16.
define void @test_popc16(i16 %a, ptr %b) {
+; CHECK32-LABEL: test_popc16(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b32 %r<4>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0:
+; CHECK32-NEXT: ld.param.u16 %r1, [test_popc16_param_0];
+; CHECK32-NEXT: popc.b32 %r2, %r1;
+; CHECK32-NEXT: ld.param.u32 %r3, [test_popc16_param_1];
+; CHECK32-NEXT: st.u16 [%r3], %r2;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: test_popc16(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b32 %r<3>;
+; CHECK64-NEXT: .reg .b64 %rd<2>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0:
+; CHECK64-NEXT: ld.param.u16 %r1, [test_popc16_param_0];
+; CHECK64-NEXT: popc.b32 %r2, %r1;
+; CHECK64-NEXT: ld.param.u64 %rd1, [test_popc16_param_1];
+; CHECK64-NEXT: st.u16 [%rd1], %r2;
+; CHECK64-NEXT: ret;
%val = call i16 @llvm.ctpop.i16(i16 %a)
store i16 %val, ptr %b
ret void
>From c5b25d310b30cd9efef6b7a88cb8188316f981c4 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 11 Mar 2025 16:57:38 +0000
Subject: [PATCH 4/4] update test
---
llvm/test/CodeGen/VE/Scalar/ctlz.ll | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/test/CodeGen/VE/Scalar/ctlz.ll b/llvm/test/CodeGen/VE/Scalar/ctlz.ll
index 602b9a86bf032..c2af9753f8bb6 100644
--- a/llvm/test/CodeGen/VE/Scalar/ctlz.ll
+++ b/llvm/test/CodeGen/VE/Scalar/ctlz.ll
@@ -200,7 +200,6 @@ define zeroext i32 @func32zx(i32 zeroext %p) {
; CHECK: # %bb.0:
; CHECK-NEXT: ldz %s0, %s0
; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i32 @llvm.ctlz.i32(i32 %p, i1 false)
ret i32 %r
More information about the llvm-commits
mailing list