[clang] [llvm] [PowerPC] Add intrinsics for rldimi/rlwimi/rlwnm (PR #82968)

Mon Feb 26 22:16:37 PST 2024

https://github.com/ecnelises updated https://github.com/llvm/llvm-project/pull/82968

>From a06fa5e18313ad50019d50006e34a6b8249d95cd Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan at cn.ibm.com>
Date: Mon, 26 Feb 2024 16:32:28 +0800
Subject: [PATCH 1/3] [PowerPC] Add intrinsics for rldimi/rlwimi/rlwnm

These builtins already exist in Clang; however, the current codegen may
produce suboptimal results due to their complex behavior. Implement them
as intrinsics to ensure that the expected instructions are emitted.
---
 clang/lib/CodeGen/CGBuiltin.cpp               |  29 ++---
 .../PowerPC/builtins-ppc-xlcompat-rotate.c    |  24 ++--
 llvm/include/llvm/IR/IntrinsicsPowerPC.td     |  12 ++
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  52 ++++++++
 llvm/test/CodeGen/PowerPC/rldimi.ll           |  15 +++
 llvm/test/CodeGen/PowerPC/rlwimi.ll           | 123 ++++++++++++------
 llvm/test/CodeGen/PowerPC/rlwinm.ll           | 108 ++++++++++-----
 7 files changed, 259 insertions(+), 104 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 734eb5a035ca49..5d55be6e9e99df 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17080,37 +17080,24 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     }
     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
   }
-  // Rotate and insert under mask operation.
-  // __rldimi(rs, is, shift, mask)
-  // (rotl64(rs, shift) & mask) | (is & ~mask)
-  // __rlwimi(rs, is, shift, mask)
-  // (rotl(rs, shift) & mask) | (is & ~mask)
   case PPC::BI__builtin_ppc_rldimi:
   case PPC::BI__builtin_ppc_rlwimi: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
     Value *Op2 = EmitScalarExpr(E->getArg(2));
     Value *Op3 = EmitScalarExpr(E->getArg(3));
-    llvm::Type *Ty = Op0->getType();
-    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
-    if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
-      Op2 = Builder.CreateZExt(Op2, Int64Ty);
-    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
-    Value *X = Builder.CreateAnd(Shift, Op3);
-    Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
-    return Builder.CreateOr(X, Y);
-  }
-  // Rotate and insert under mask operation.
-  // __rlwnm(rs, shift, mask)
-  // rotl(rs, shift) & mask
+    return Builder.CreateCall(
+        CGM.getIntrinsic(BuiltinID == PPC::BI__builtin_ppc_rldimi
+                             ? Intrinsic::ppc_rldimi
+                             : Intrinsic::ppc_rlwimi),
+        {Op0, Op1, Op2, Op3});
+  }
   case PPC::BI__builtin_ppc_rlwnm: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
     Value *Op2 = EmitScalarExpr(E->getArg(2));
-    llvm::Type *Ty = Op0->getType();
-    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
-    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
-    return Builder.CreateAnd(Shift, Op2);
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_rlwnm),
+                              {Op0, Op1, Op2});
   }
   case PPC::BI__builtin_ppc_poppar4:
   case PPC::BI__builtin_ppc_poppar8: {
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
index d96bfb4621421e..b218547c00d931 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
@@ -16,11 +16,8 @@ void test_builtin_ppc_rldimi() {
   // CHECK:       %res = alloca i64, align 8
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
   // CHECK-NEXT:  [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.fshl.i64(i64 [[RA]], i64 [[RA]], i64 63)
-  // CHECK-NEXT:  [[RD:%[0-9]+]] = and i64 [[RC]], 72057593769492480
-  // CHECK-NEXT:  [[RE:%[0-9]+]] = and i64 [[RB]], -72057593769492481
-  // CHECK-NEXT:  [[RF:%[0-9]+]] = or i64 [[RD]], [[RE]]
-  // CHECK-NEXT:  store i64 [[RF]], ptr %res, align 8
+  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
+  // CHECK-NEXT:  store i64 [[RC]], ptr %res, align 8
   // CHECK-NEXT:  ret void
 
   /*shift = 63, mask = 0x00FFFFFFF0000000 = 72057593769492480, ~mask = 0xFF0000000FFFFFFF = -72057593769492481*/
@@ -32,11 +29,8 @@ void test_builtin_ppc_rlwimi() {
   // CHECK:       %res = alloca i32, align 4
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
   // CHECK-NEXT:  [[RB:%[0-9]+]] = load i32, ptr @ui, align 4
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 31)
-  // CHECK-NEXT:  [[RD:%[0-9]+]] = and i32 [[RC]], 16776960
-  // CHECK-NEXT:  [[RE:%[0-9]+]] = and i32 [[RB]], -16776961
-  // CHECK-NEXT:  [[RF:%[0-9]+]] = or i32 [[RD]], [[RE]]
-  // CHECK-NEXT:  store i32 [[RF]], ptr %res, align 4
+  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.ppc.rlwimi(i32 [[RA]], i32 [[RB]], i32 31, i32 16776960)
+  // CHECK-NEXT:  store i32 [[RC]], ptr %res, align 4
   // CHECK-NEXT:  ret void
 
   /*shift = 31, mask = 0xFFFF00 = 16776960, ~mask = 0xFFFFFFFFFF0000FF = -16776961*/
@@ -47,9 +41,8 @@ void test_builtin_ppc_rlwnm() {
   // CHECK-LABEL: test_builtin_ppc_rlwnm
   // CHECK:       %res = alloca i32, align 4
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
-  // CHECK-NEXT:  [[RB:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 31)
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = and i32 [[RB]], 511
-  // CHECK-NEXT:  store i32 [[RC]], ptr %res, align 4
+  // CHECK-NEXT:  [[RB:%[0-9]+]] = call i32 @llvm.ppc.rlwnm(i32 [[RA]], i32 31, i32 511)
+  // CHECK-NEXT:  store i32 [[RB]], ptr %res, align 4
   // CHECK-NEXT:  ret void
 
   /*shift = 31, mask = 0x1FF = 511*/
@@ -63,9 +56,8 @@ void test_builtin_ppc_rlwnm2(unsigned int shift) {
   // CHECK-NEXT:  store i32 %shift, ptr %shift.addr, align 4
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
   // CHECK-NEXT:  [[RB:%[0-9]+]] = load i32, ptr %shift.addr, align 4
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 [[RB]])
-  // CHECK-NEXT:  [[RD:%[0-9]+]] = and i32 [[RC]], 511
-  // CHECK-NEXT:  store i32 [[RD]], ptr %res, align 4
+  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.ppc.rlwnm(i32 [[RA]], i32 [[RB]], i32 511)
+  // CHECK-NEXT:  store i32 [[RC]], ptr %res, align 4
   // CHECK-NEXT:  ret void
 
   /*mask = 0x1FF = 511*/
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index bfc2b17043bc79..ee9a04241ac2ec 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -182,6 +182,18 @@ let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
   def int_ppc_fctuwz
       : ClangBuiltin<"__builtin_ppc_fctuwz">,
         DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+  def int_ppc_rldimi
+      : ClangBuiltin<"__builtin_ppc_rldimi">,
+        DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+  def int_ppc_rlwimi
+      : ClangBuiltin<"__builtin_ppc_rlwimi">,
+        DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+  def int_ppc_rlwnm
+      : ClangBuiltin<"__builtin_ppc_rlwnm">,
+        DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   // XL compatible select functions
   // TODO: Add llvm_f128_ty support.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 51becf1d5b8584..f84addbf728ad0 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -641,6 +641,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  // setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -10722,6 +10723,20 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
   return true;
 }
 
+bool isContiguousMask(const APInt &Val, unsigned &MB, unsigned &ME,
+                      unsigned BitWidth) {
+  unsigned MaskLen = 0;
+  if (Val.isShiftedMask(MB, MaskLen)) {
+    MB = (BitWidth - MB - MaskLen) % BitWidth;
+  } else if ((~Val).isShiftedMask(MB, MaskLen)) {
+    MB = (BitWidth - MB) % BitWidth;
+  } else {
+    return false;
+  }
+  ME = (MB + MaskLen - 1) % BitWidth;
+  return true;
+}
+
 /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
 /// lower, do it, otherwise return null.
 SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -10737,6 +10752,43 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getRegister(PPC::X13, MVT::i64);
     return DAG.getRegister(PPC::R2, MVT::i32);
 
+  case Intrinsic::ppc_rldimi: {
+    uint64_t SH = Op.getConstantOperandVal(3);
+    unsigned MB = 0, ME = 0;
+    if (!isContiguousMask(Op.getConstantOperandAPInt(4), MB, ME, 64) ||
+        ME != 63 - SH)
+      llvm_unreachable("invalid rldimi mask!");
+    return SDValue(DAG.getMachineNode(
+                       PPC::RLDIMI, dl, MVT::i64,
+                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                        DAG.getTargetConstant(MB, dl, MVT::i32)}),
+                   0);
+  }
+
+  case Intrinsic::ppc_rlwimi: {
+    unsigned MB = 0, ME = 0;
+    if (!isContiguousMask(Op.getConstantOperandAPInt(4), MB, ME, 32))
+      llvm_unreachable("invalid rlwimi mask!");
+    return SDValue(DAG.getMachineNode(
+                       PPC::RLWIMI, dl, MVT::i32,
+                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                        DAG.getTargetConstant(MB, dl, MVT::i32),
+                        DAG.getTargetConstant(ME, dl, MVT::i32)}),
+                   0);
+  }
+
+  case Intrinsic::ppc_rlwnm: {
+    unsigned MB = 0, ME = 0;
+    if (!isContiguousMask(Op.getConstantOperandAPInt(3), MB, ME, 32))
+      llvm_unreachable("invalid rlwnm mask!");
+    return SDValue(
+        DAG.getMachineNode(PPC::RLWNM, dl, MVT::i32,
+                           {Op.getOperand(1), Op.getOperand(2),
+                            DAG.getTargetConstant(MB, dl, MVT::i32),
+                            DAG.getTargetConstant(ME, dl, MVT::i32)}),
+        0);
+  }
+
   case Intrinsic::ppc_mma_disassemble_acc: {
     if (Subtarget.isISAFuture()) {
       EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
diff --git a/llvm/test/CodeGen/PowerPC/rldimi.ll b/llvm/test/CodeGen/PowerPC/rldimi.ll
index 4e26ddfc37f99e..322975f547c996 100644
--- a/llvm/test/CodeGen/PowerPC/rldimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rldimi.ll
@@ -58,3 +58,18 @@ entry:
   %8 = or i64 %6, %7
   ret i64 %8
 }
+
+define i64 @rldimi_intrinsic(i64 %a) {
+; CHECK-LABEL: rldimi_intrinsic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rldimi 3, 3, 8, 0
+; CHECK-NEXT:    rldimi 3, 3, 16, 0
+; CHECK-NEXT:    rldimi 3, 3, 32, 0
+; CHECK-NEXT:    blr
+  %r1 = call i64 @llvm.ppc.rldimi(i64 %a, i64 %a, i32 8, i64 -256)
+  %r2 = call i64 @llvm.ppc.rldimi(i64 %r1, i64 %r1, i32 16, i64 -65536)
+  %r3 = call i64 @llvm.ppc.rldimi(i64 %r2, i64 %r2, i32 32, i64 -4294967296)
+  ret i64 %r3
+}
+
+declare i64 @llvm.ppc.rldimi(i64, i64, i32 immarg, i64 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwimi.ll b/llvm/test/CodeGen/PowerPC/rlwimi.ll
index e701236b840b2c..8b126cd3393c10 100644
--- a/llvm/test/CodeGen/PowerPC/rlwimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwimi.ll
@@ -1,70 +1,117 @@
-; All of these ands and shifts should be folded into rlwimi's
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | not grep and
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | grep rlwimi | count 8
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
 
 define i32 @test1(i32 %x, i32 %y) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 4, 3, 16, 0, 15
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = shl i32 %x, 16		; <i32> [#uses=1]
-	%tmp.7 = and i32 %y, 65535		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.7, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.3 = shl i32 %x, 16
+  %tmp.7 = and i32 %y, 65535
+  %tmp.9 = or i32 %tmp.7, %tmp.3
+  ret i32 %tmp.9
 }
 
 define i32 @test2(i32 %x, i32 %y) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 16, 0, 15
+; CHECK-NEXT:    blr
 entry:
-	%tmp.7 = and i32 %x, 65535		; <i32> [#uses=1]
-	%tmp.3 = shl i32 %y, 16		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.7, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.7 = and i32 %x, 65535
+  %tmp.3 = shl i32 %y, 16
+  %tmp.9 = or i32 %tmp.7, %tmp.3
+  ret i32 %tmp.9
 }
 
 define i32 @test3(i32 %x, i32 %y) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 4, 3, 16, 16, 31
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = lshr i32 %x, 16		; <i32> [#uses=1]
-	%tmp.6 = and i32 %y, -65536		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.6, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.3 = lshr i32 %x, 16
+  %tmp.6 = and i32 %y, -65536
+  %tmp.7 = or i32 %tmp.6, %tmp.3
+  ret i32 %tmp.7
 }
 
 define i32 @test4(i32 %x, i32 %y) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 16, 16, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.6 = and i32 %x, -65536		; <i32> [#uses=1]
-	%tmp.3 = lshr i32 %y, 16		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.6, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.6 = and i32 %x, -65536
+  %tmp.3 = lshr i32 %y, 16
+  %tmp.7 = or i32 %tmp.6, %tmp.3
+  ret i32 %tmp.7
 }
 
 define i32 @test5(i32 %x, i32 %y) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 4, 3, 1, 0, 15
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = shl i32 %x, 1		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, -65536		; <i32> [#uses=1]
-	%tmp.7 = and i32 %y, 65535		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.4, %tmp.7		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.3 = shl i32 %x, 1
+  %tmp.4 = and i32 %tmp.3, -65536
+  %tmp.7 = and i32 %y, 65535
+  %tmp.9 = or i32 %tmp.4, %tmp.7
+  ret i32 %tmp.9
 }
 
 define i32 @test6(i32 %x, i32 %y) {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 1, 0, 15
+; CHECK-NEXT:    blr
 entry:
-	%tmp.7 = and i32 %x, 65535		; <i32> [#uses=1]
-	%tmp.3 = shl i32 %y, 1		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, -65536		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.4, %tmp.7		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.7 = and i32 %x, 65535
+  %tmp.3 = shl i32 %y, 1
+  %tmp.4 = and i32 %tmp.3, -65536
+  %tmp.9 = or i32 %tmp.4, %tmp.7
+  ret i32 %tmp.9
 }
 
 define i32 @test7(i32 %x, i32 %y) {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andis. 3, 3, 65535
+; CHECK-NEXT:    rldimi 3, 4, 0, 48
+; CHECK-NEXT:    blr
 entry:
-	%tmp.2 = and i32 %x, -65536		; <i32> [#uses=1]
-	%tmp.5 = and i32 %y, 65535		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.5, %tmp.2		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.2 = and i32 %x, -65536
+  %tmp.5 = and i32 %y, 65535
+  %tmp.7 = or i32 %tmp.5, %tmp.2
+  ret i32 %tmp.7
 }
 
 define i32 @test8(i32 %bar) {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 3, 1, 30, 30
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = shl i32 %bar, 1		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, 2		; <i32> [#uses=1]
-	%tmp.6 = and i32 %bar, -3		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.4, %tmp.6		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.3 = shl i32 %bar, 1
+  %tmp.4 = and i32 %tmp.3, 2
+  %tmp.6 = and i32 %bar, -3
+  %tmp.7 = or i32 %tmp.4, %tmp.6
+  ret i32 %tmp.7
 }
+
+define i32 @test9(i32 %a, i32 %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 8, 20, 26
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 4064)
+  ret i32 %r
+}
+
+declare i32 @llvm.ppc.rlwimi(i32, i32, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwinm.ll b/llvm/test/CodeGen/PowerPC/rlwinm.ll
index 2f3b3bf003cf65..73e4b5f6b7ff60 100644
--- a/llvm/test/CodeGen/PowerPC/rlwinm.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwinm.ll
@@ -1,61 +1,111 @@
-; All of these ands and shifts should be folded into rlwimi's
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -o %t
-; RUN: not grep and %t
-; RUN: not grep srawi %t
-; RUN: not grep srwi %t
-; RUN: not grep slwi %t
-; RUN: grep rlwinm %t | count 8
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
 
 define i32 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 0, 4, 19
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 268431360		; <i32> [#uses=1]
-	ret i32 %tmp.1
+  %tmp.1 = and i32 %a, 268431360
+  ret i32 %tmp.1
 }
 
 define i32 @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rldicl 3, 3, 36, 24
+; CHECK-NEXT:    rldicl 3, 3, 28, 32
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, -268435441		; <i32> [#uses=1]
-	ret i32 %tmp.1
+  %tmp.1 = and i32 %a, -268435441
+  ret i32 %tmp.1
 }
 
 define i32 @test3(i32 %a) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.2 = ashr i32 %a, 8		; <i32> [#uses=1]
-	%tmp.3 = and i32 %tmp.2, 255		; <i32> [#uses=1]
-	ret i32 %tmp.3
+  %tmp.2 = ashr i32 %a, 8
+  %tmp.3 = and i32 %tmp.2, 255
+  ret i32 %tmp.3
 }
 
 define i32 @test4(i32 %a) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = lshr i32 %a, 8		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, 255		; <i32> [#uses=1]
-	ret i32 %tmp.4
+  %tmp.3 = lshr i32 %a, 8
+  %tmp.4 = and i32 %tmp.3, 255
+  ret i32 %tmp.4
 }
 
 define i32 @test5(i32 %a) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 8, 0, 8
+; CHECK-NEXT:    blr
 entry:
-	%tmp.2 = shl i32 %a, 8		; <i32> [#uses=1]
-	%tmp.3 = and i32 %tmp.2, -8388608		; <i32> [#uses=1]
-	ret i32 %tmp.3
+  %tmp.2 = shl i32 %a, 8
+  %tmp.3 = and i32 %tmp.2, -8388608
+  ret i32 %tmp.3
 }
 
 define i32 @test6(i32 %a) {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 65280		; <i32> [#uses=1]
-	%tmp.2 = ashr i32 %tmp.1, 8		; <i32> [#uses=1]
-	ret i32 %tmp.2
+  %tmp.1 = and i32 %a, 65280
+  %tmp.2 = ashr i32 %tmp.1, 8
+  ret i32 %tmp.2
 }
 
 define i32 @test7(i32 %a) {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 65280		; <i32> [#uses=1]
-	%tmp.2 = lshr i32 %tmp.1, 8		; <i32> [#uses=1]
-	ret i32 %tmp.2
+  %tmp.1 = and i32 %a, 65280
+  %tmp.2 = lshr i32 %tmp.1, 8
+  ret i32 %tmp.2
 }
 
 define i32 @test8(i32 %a) {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 8, 0, 7
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 16711680		; <i32> [#uses=1]
-	%tmp.2 = shl i32 %tmp.1, 8		; <i32> [#uses=1]
-	ret i32 %tmp.2
+  %tmp.1 = and i32 %a, 16711680
+  %tmp.2 = shl i32 %tmp.1, 8
+  ret i32 %tmp.2
 }
+
+define i32 @test9(i32 %a, i32 %s) {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwnm 3, 3, 4, 23, 31
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 %s, i32 511)
+  ret i32 %r
+}
+
+define i32 @test10(i32 %a) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 31, 23, 31
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 31, i32 511)
+  ret i32 %r
+}
+
+declare i32 @llvm.ppc.rlwnm(i32, i32, i32 immarg)

>From d9c9b4eb91ca3cec0bc469364914706b89ab1eeb Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan at cn.ibm.com>
Date: Tue, 27 Feb 2024 14:07:55 +0800
Subject: [PATCH 2/3] Use isRunOfOnes

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 28 ++++++---------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f84addbf728ad0..178904d1e38d29 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "PPC.h"
 #include "PPCCCState.h"
@@ -641,7 +642,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-  // setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -10723,20 +10723,6 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
   return true;
 }
 
-bool isContiguousMask(const APInt &Val, unsigned &MB, unsigned &ME,
-                      unsigned BitWidth) {
-  unsigned MaskLen = 0;
-  if (Val.isShiftedMask(MB, MaskLen)) {
-    MB = (BitWidth - MB - MaskLen) % BitWidth;
-  } else if ((~Val).isShiftedMask(MB, MaskLen)) {
-    MB = (BitWidth - MB) % BitWidth;
-  } else {
-    return false;
-  }
-  ME = (MB + MaskLen - 1) % BitWidth;
-  return true;
-}
-
 /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
 /// lower, do it, otherwise return null.
 SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -10755,9 +10741,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::ppc_rldimi: {
     uint64_t SH = Op.getConstantOperandVal(3);
     unsigned MB = 0, ME = 0;
-    if (!isContiguousMask(Op.getConstantOperandAPInt(4), MB, ME, 64) ||
+    if (!isRunOfOnes64(Op.getConstantOperandVal(4), MB, ME) ||
         ME != 63 - SH)
-      llvm_unreachable("invalid rldimi mask!");
+      report_fatal_error("invalid rldimi mask!");
     return SDValue(DAG.getMachineNode(
                        PPC::RLDIMI, dl, MVT::i64,
                        {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
@@ -10767,8 +10753,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
   case Intrinsic::ppc_rlwimi: {
     unsigned MB = 0, ME = 0;
-    if (!isContiguousMask(Op.getConstantOperandAPInt(4), MB, ME, 32))
-      llvm_unreachable("invalid rlwimi mask!");
+    if (!isRunOfOnes(Op.getConstantOperandVal(4), MB, ME))
+      report_fatal_error("invalid rlwimi mask!");
     return SDValue(DAG.getMachineNode(
                        PPC::RLWIMI, dl, MVT::i32,
                        {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
@@ -10779,8 +10765,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
   case Intrinsic::ppc_rlwnm: {
     unsigned MB = 0, ME = 0;
-    if (!isContiguousMask(Op.getConstantOperandAPInt(3), MB, ME, 32))
-      llvm_unreachable("invalid rlwnm mask!");
+    if (!isRunOfOnes(Op.getConstantOperandVal(3), MB, ME))
+      report_fatal_error("invalid rlwnm mask!");
     return SDValue(
         DAG.getMachineNode(PPC::RLWNM, dl, MVT::i32,
                            {Op.getOperand(1), Op.getOperand(2),

>From 954f9b65907e1fd8b1726d5988f129290fe60819 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan at cn.ibm.com>
Date: Tue, 27 Feb 2024 14:16:06 +0800
Subject: [PATCH 3/3] Make clang-format happy

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 178904d1e38d29..7a9b1520ec9f71 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10741,8 +10741,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::ppc_rldimi: {
     uint64_t SH = Op.getConstantOperandVal(3);
     unsigned MB = 0, ME = 0;
-    if (!isRunOfOnes64(Op.getConstantOperandVal(4), MB, ME) ||
-        ME != 63 - SH)
+    if (!isRunOfOnes64(Op.getConstantOperandVal(4), MB, ME) || ME != 63 - SH)
       report_fatal_error("invalid rldimi mask!");
     return SDValue(DAG.getMachineNode(
                        PPC::RLDIMI, dl, MVT::i64,