[llvm] 9b87ad3 - [LoongArch] Implement OR combination to generate bstrins.w/d

Weining Lu via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 14 02:21:31 PDT 2022


Author: Weining Lu
Date: 2022-07-14T17:20:43+08:00
New Revision: 9b87ad33c1fa0ecf09bbdc5cc2384ae081101269

URL: https://github.com/llvm/llvm-project/commit/9b87ad33c1fa0ecf09bbdc5cc2384ae081101269
DIFF: https://github.com/llvm/llvm-project/commit/9b87ad33c1fa0ecf09bbdc5cc2384ae081101269.diff

LOG: [LoongArch] Implement OR combination to generate bstrins.w/d

Differential Revision: https://reviews.llvm.org/D129357

Added: 
    llvm/test/CodeGen/LoongArch/bstrins_d.ll
    llvm/test/CodeGen/LoongArch/bstrins_w.ll

Modified: 
    llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
    llvm/lib/Target/LoongArch/LoongArchISelLowering.h
    llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
    llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 5c2652114375d..4acf90bd97884 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
 
 using namespace llvm;
 
@@ -102,6 +103,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   setMinFunctionAlignment(FunctionAlignment);
 
   setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::SRL);
 }
 
@@ -502,6 +504,224 @@ static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const LoongArchSubtarget &Subtarget) {
+  MVT GRLenVT = Subtarget.getGRLenVT();
+  EVT ValTy = N->getValueType(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+  ConstantSDNode *CN0, *CN1;
+  SDLoc DL(N);
+  unsigned ValBits = ValTy.getSizeInBits();
+  unsigned MaskIdx0, MaskLen0, MaskIdx1, MaskLen1;
+  unsigned Shamt;
+  bool SwapAndRetried = false;
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (ValBits != 32 && ValBits != 64)
+    return SDValue();
+
+Retry:
+  // 1st pattern to match BSTRINS:
+  //  R = or (and X, mask0), (and (shl Y, lsb), mask1)
+  //  where mask1 = (2**size - 1) << lsb, mask0 = ~mask1
+  //  =>
+  //  R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1)
+  if (N0.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) &&
+      MaskIdx0 == MaskIdx1 && MaskLen0 == MaskLen1 &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      (Shamt = CN1->getZExtValue()) == MaskIdx0 &&
+      (MaskIdx0 + MaskLen0 <= ValBits)) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 1\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       N1.getOperand(0).getOperand(0),
+                       DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+                       DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 2nd pattern to match BSTRINS:
+  //  R = or (and X, mask0), (shl (and Y, mask1), lsb)
+  //  where mask1 = (2**size - 1), mask0 = ~(mask1 << lsb)
+  //  =>
+  //  R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1)
+  if (N0.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      (Shamt = CN1->getZExtValue()) == MaskIdx0 &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) &&
+      MaskLen0 == MaskLen1 && MaskIdx1 == 0 &&
+      (MaskIdx0 + MaskLen0 <= ValBits)) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 2\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       N1.getOperand(0).getOperand(0),
+                       DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+                       DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 3rd pattern to match BSTRINS:
+  //  R = or (and X, mask0), (and Y, mask1)
+  //  where ~mask0 = (2**size - 1) << lsb, mask0 & mask1 = 0
+  //  =>
+  //  R = BSTRINS X, (shr (and Y, mask1), lsb), msb, lsb
+  //  where msb = lsb + size - 1
+  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      (MaskIdx0 + MaskLen0 <= 64) &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1->getOperand(1))) &&
+      (CN1->getSExtValue() & CN0->getSExtValue()) == 0) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 3\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       DAG.getNode(ISD::SRL, DL, N1->getValueType(0), N1,
+                                   DAG.getConstant(MaskIdx0, DL, GRLenVT)),
+                       DAG.getConstant(ValBits == 32
+                                           ? (MaskIdx0 + (MaskLen0 & 31) - 1)
+                                           : (MaskIdx0 + MaskLen0 - 1),
+                                       DL, GRLenVT),
+                       DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 4th pattern to match BSTRINS:
+  //  R = or (and X, mask), (shl Y, shamt)
+  //  where mask = (2**shamt - 1)
+  //  =>
+  //  R = BSTRINS X, Y, ValBits - 1, shamt
+  //  where ValBits = 32 or 64
+  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::SHL &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(CN0->getZExtValue(), MaskIdx0, MaskLen0) &&
+      MaskIdx0 == 0 && (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      (Shamt = CN1->getZExtValue()) == MaskLen0 &&
+      (MaskIdx0 + MaskLen0 <= ValBits)) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 4\n");
+    return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+                       N1.getOperand(0),
+                       DAG.getConstant((ValBits - 1), DL, GRLenVT),
+                       DAG.getConstant(Shamt, DL, GRLenVT));
+  }
+
+  // 5th pattern to match BSTRINS:
+  //  R = or (and X, mask), const
+  //  where ~mask = (2**size - 1) << lsb, mask & const = 0
+  //  =>
+  //  R = BSTRINS X, (const >> lsb), msb, lsb
+  //  where msb = lsb + size - 1
+  if (N0.getOpcode() == ISD::AND &&
+      (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+      isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+      (CN1 = dyn_cast<ConstantSDNode>(N1)) &&
+      (CN1->getSExtValue() & CN0->getSExtValue()) == 0) {
+    LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 5\n");
+    return DAG.getNode(
+        LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+        DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy),
+        DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+        DAG.getConstant(MaskIdx0, DL, GRLenVT));
+  }
+
+  // 6th pattern.
+  // a = b | ((c & mask) << shamt), where all positions in b to be overwritten
+  // by the incoming bits are known to be zero.
+  // =>
+  // a = BSTRINS b, c, shamt + MaskLen - 1, shamt
+  //
+  // Note that the 1st pattern is a special situation of the 6th, i.e. the 6th
+  // pattern is more common than the 1st. So we put the 1st before the 6th in
+  // order to match as many nodes as possible.
+  ConstantSDNode *CNMask, *CNShamt;
+  unsigned MaskIdx, MaskLen;
+  if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND &&
+      (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) &&
+      MaskIdx == 0 && (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      CNShamt->getZExtValue() + MaskLen <= ValBits) {
+    Shamt = CNShamt->getZExtValue();
+    APInt ShMask(ValBits, CNMask->getZExtValue() << Shamt);
+    if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+      LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 6\n");
+      return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+                         N1.getOperand(0).getOperand(0),
+                         DAG.getConstant(Shamt + MaskLen - 1, DL, GRLenVT),
+                         DAG.getConstant(Shamt, DL, GRLenVT));
+    }
+  }
+
+  // 7th pattern.
+  // a = b | ((c << shamt) & shifted_mask), where all positions in b to be
+  // overwritten by the incoming bits are known to be zero.
+  // =>
+  // a = BSTRINS b, c, MaskIdx + MaskLen - 1, MaskIdx
+  //
+  // Similarly, the 7th pattern is more common than the 2nd. So we put the 2nd
+  // before the 7th in order to match as many nodes as possible.
+  if (N1.getOpcode() == ISD::AND &&
+      (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) &&
+      N1.getOperand(0).getOpcode() == ISD::SHL &&
+      (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+      CNShamt->getZExtValue() == MaskIdx) {
+    APInt ShMask(ValBits, CNMask->getZExtValue());
+    if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+      LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 7\n");
+      return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+                         N1.getOperand(0).getOperand(0),
+                         DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT),
+                         DAG.getConstant(MaskIdx, DL, GRLenVT));
+    }
+  }
+
+  // (or a, b) and (or b, a) are equivalent, so swap the operands and retry.
+  if (!SwapAndRetried) {
+    std::swap(N0, N1);
+    SwapAndRetried = true;
+    goto Retry;
+  }
+
+  SwapAndRetried = false;
+Retry2:
+  // 8th pattern.
+  // a = b | (c & shifted_mask), where all positions in b to be overwritten by
+  // the incoming bits are known to be zero.
+  // =>
+  // a = BSTRINS b, c >> MaskIdx, MaskIdx + MaskLen - 1, MaskIdx
+  //
+  // Similarly, the 8th pattern is more common than the 4th and 5th patterns. So
+  // we put it here in order to match as many nodes as possible or generate less
+  // instructions.
+  if (N1.getOpcode() == ISD::AND &&
+      (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+      isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen)) {
+    APInt ShMask(ValBits, CNMask->getZExtValue());
+    if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+      LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 8\n");
+      return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+                         DAG.getNode(ISD::SRL, DL, N1->getValueType(0),
+                                     N1->getOperand(0),
+                                     DAG.getConstant(MaskIdx, DL, GRLenVT)),
+                         DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT),
+                         DAG.getConstant(MaskIdx, DL, GRLenVT));
+    }
+  }
+  // Swap N0/N1 and retry.
+  if (!SwapAndRetried) {
+    std::swap(N0, N1);
+    SwapAndRetried = true;
+    goto Retry2;
+  }
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -510,6 +730,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   case ISD::AND:
     return performANDCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, DAG, DCI, Subtarget);
   case ISD::SRL:
     return performSRLCombine(N, DAG, DCI, Subtarget);
   }
@@ -579,6 +801,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(SLL_W)
     NODE_NAME_CASE(SRA_W)
     NODE_NAME_CASE(SRL_W)
+    NODE_NAME_CASE(BSTRINS)
     NODE_NAME_CASE(BSTRPICK)
     NODE_NAME_CASE(MOVGR2FR_W_LA64)
     NODE_NAME_CASE(MOVFR2GR_S_LA64)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index be58660893eb3..279550482675e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -41,6 +41,7 @@ enum NodeType : unsigned {
 
   FTINT,
 
+  BSTRINS,
   BSTRPICK,
 
 };

diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index e3286f6590ccb..d07d086bd7da8 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -26,6 +26,11 @@ def SDT_LoongArchIntBinOpW : SDTypeProfile<1, 2, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>
 ]>;
 
+def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [
+  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>,
+  SDTCisSameAs<3, 4>
+]>;
+
 def SDT_LoongArchBStrPick: SDTypeProfile<1, 3, [
   SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<2, 3>
 ]>;
@@ -46,6 +51,8 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
 def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
+def loongarch_bstrins
+    : SDNode<"LoongArchISD::BSTRINS", SDT_LoongArchBStrIns>;
 def loongarch_bstrpick
     : SDNode<"LoongArchISD::BSTRPICK", SDT_LoongArchBStrPick>;
 
@@ -777,15 +784,21 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in
 def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
 
-/// BSTRPICK
+/// BSTRINS and BSTRPICK
 
-let Predicates = [IsLA32] in
+let Predicates = [IsLA32] in {
+def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
+          (BSTRINS_W GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
 def : Pat<(loongarch_bstrpick GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
           (BSTRPICK_W GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
+} // Predicates = [IsLA32]
 
-let Predicates = [IsLA64] in
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd),
+          (BSTRINS_D GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>;
 def : Pat<(loongarch_bstrpick GPR:$rj, uimm6:$msbd, uimm6:$lsbd),
           (BSTRPICK_D GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>;
+} // Predicates = [IsLA64]
 
 /// Loads
 

diff --git a/llvm/test/CodeGen/LoongArch/bstrins_d.ll b/llvm/test/CodeGen/LoongArch/bstrins_d.ll
new file mode 100644
index 0000000000000..342e044c7a7be
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bstrins_d.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s
+
+;; Test generation of the bstrins.d instruction.
+;; There are 8 patterns that can be matched to bstrins.d. See performORCombine
+;; for details.
+
+;; Pattern 1
+;; R = or (and X, mask0), (and (shl Y, lsb), mask1)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i64 @pat1(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241  ; 0xffffff000000ffff
+  %shl = shl i64 %b, 16
+  %and2 = and i64 %shl, 1099511562240 ; 0x000000ffffff0000
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+define i64 @pat1_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat1_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241  ; 0xffffff000000ffff
+  %shl = shl i64 %b, 16
+  %and2 = and i64 %shl, 1099511562240 ; 0x000000ffffff0000
+  %or = or i64 %and2, %and1
+  ret i64 %or
+}
+
+;; Pattern 2
+;; R = or (and X, mask0), (shl (and Y, mask1), lsb)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i64 @pat2(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241 ; 0xffffff000000ffff
+  %and2 = and i64 %b, 16777215       ; 0x0000000000ffffff
+  %shl = shl i64 %and2, 16
+  %or = or i64 %and1, %shl
+  ret i64 %or
+}
+
+define i64 @pat2_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat2_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -1099511562241 ; 0xffffff000000ffff
+  %and2 = and i64 %b, 16777215       ; 0x0000000000ffffff
+  %shl = shl i64 %and2, 16
+  %or = or i64 %shl, %and1
+  ret i64 %or
+}
+
+;; Pattern 3
+;; R = or (and X, mask0), (and Y, mask1)
+;; =>
+;; R = BSTRINS X, (srl (and Y, mask1), lsb), msb, lsb
+define i64 @pat3(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.d $a1, $a1, 4
+; CHECK-NEXT:    bstrins.d $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -4081 ; 0xfffffffffffff00f
+  %and2 = and i64 %b, 288   ; 0x0000000000000120
+  %or = or i64 %and1, %and2
+  ret i64 %or
+}
+
+define i64 @pat3_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat3_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.d $a1, $a1, 4
+; CHECK-NEXT:    bstrins.d $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i64 %a, -4081 ; 0xfffffffffffff00f
+  %and2 = and i64 %b, 288   ; 0x0000000000000120
+  %or = or i64 %and2, %and1
+  ret i64 %or
+}
+
+;; Pattern 4
+;; R = or (and X, mask), (shl Y, shamt)
+;; =>
+;; R = BSTRINS X, Y, 63, shamt
+define i64 @pat4(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 63, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 255
+  %shl = shl i64 %b, 8
+  %or = or i64 %and, %shl
+  ret i64 %or
+}
+
+define i64 @pat4_swap(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: pat4_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.d $a0, $a1, 63, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 255
+  %shl = shl i64 %b, 8
+  %or = or i64 %shl, %and
+  ret i64 %or
+}
+
+;; Pattern 5
+;; R = or (and X, mask0), const
+;; =>
+;; R = BSTRINS X, (const >> lsb), msb, lsb
+define i64 @pat5(i64 %a) nounwind {
+; CHECK-LABEL: pat5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 74565
+; CHECK-NEXT:    ori $a1, $a1, 1656
+; CHECK-NEXT:    bstrins.d $a0, $a1, 47, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 18446462598732906495 ; 0xffff00000000ffff
+  %or = or i64 %and, 20015998304256       ; 0x0000123456780000
+  ret i64 %or
+}
+
+;; Pattern 6: a = b | ((c & mask) << shamt)
+;; In this testcase b is 0x123456000000789a, but in fact we do not require b
+;; being a constant. As long as all positions in b to be overwritten by the
+;; incoming bits are known to be zero, the pattern could be matched.
+define i64 @pat6(i64 %c) nounwind {
+; CHECK-LABEL: pat6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 7
+; CHECK-NEXT:    ori $a1, $a1, 2202
+; CHECK-NEXT:    lu32i.d $a1, 284160
+; CHECK-NEXT:    lu52i.d $a1, $a1, 291
+; CHECK-NEXT:    bstrins.d $a1, $a0, 39, 16
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %c, 16777215            ; 0x0000000000ffffff
+  %shl = shl i64 %and, 16
+  %or = or i64 %shl, 1311767949471676570 ; 0x123456000000789a
+  ret i64 %or
+}
+
+;; Pattern 7: a = b | ((c << shamt) & shifted_mask)
+;; Similar to pattern 6.
+define i64 @pat7(i64 %c) nounwind {
+; CHECK-LABEL: pat7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 7
+; CHECK-NEXT:    ori $a1, $a1, 2202
+; CHECK-NEXT:    lu32i.d $a1, 284160
+; CHECK-NEXT:    lu52i.d $a1, $a1, 291
+; CHECK-NEXT:    bstrins.d $a1, $a0, 39, 16
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %shl = shl i64 %c, 16
+  %and = and i64 %shl, 1099511562240     ; 0x000000ffffff0000
+  %or = or i64 %and, 1311767949471676570 ; 0x123456000000789a
+  ret i64 %or
+}
+
+;; Pattern 8: a = b | (c & shifted_mask)
+;; Similar to pattern 7 but without shift to c.
+define i64 @pat8(i64 %c) nounwind {
+; CHECK-LABEL: pat8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli.d $a1, $a0, 16
+; CHECK-NEXT:    lu12i.w $a0, 7
+; CHECK-NEXT:    ori $a0, $a0, 2202
+; CHECK-NEXT:    lu32i.d $a0, 284160
+; CHECK-NEXT:    lu52i.d $a0, $a0, 291
+; CHECK-NEXT:    bstrins.d $a0, $a1, 39, 16
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %c, 1099511562240       ; 0x000000ffffff0000
+  %or = or i64 %and, 1311767949471676570 ; 0x123456000000789a
+  ret i64 %or
+}
+
+;; Test that bstrins.d is not generated because constant OR operand
+;; doesn't fit into bits cleared by constant AND operand.
+define i64 @no_bstrins_d(i64 %a) nounwind {
+; CHECK-LABEL: no_bstrins_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 354185
+; CHECK-NEXT:    lu32i.d $a1, 4660
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    lu12i.w $a1, 354191
+; CHECK-NEXT:    ori $a1, $a1, 4095
+; CHECK-NEXT:    lu32i.d $a1, -60876
+; CHECK-NEXT:    and $a0, $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i64 %a, 18446462598732906495 ; 0xffff00000000ffff
+  %or = or i64 %and, 20015998341120       ; 0x0000123456789000
+  ret i64 %or
+}

diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
new file mode 100644
index 0000000000000..47c4d826c2ee5
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s
+
+;; Test generation of the bstrins.w instruction.
+;; There are 8 patterns that can be matched to bstrins.w. See performORCombine
+;; for details.
+
+;; Pattern 1
+;; R = or (and X, mask0), (and (shl Y, lsb), mask1)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i32 @pat1(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321  ; 0xfff000ff
+  %shl = shl i32 %b, 8
+  %and2 = and i32 %shl, 1048320 ; 0x000fff00
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+define i32 @pat1_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat1_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321  ; 0xfff000ff
+  %shl = shl i32 %b, 8
+  %and2 = and i32 %shl, 1048320 ; 0x000fff00
+  %or = or i32 %and2, %and1
+  ret i32 %or
+}
+
+;; Pattern 2
+;; R = or (and X, mask0), (shl (and Y, mask1), lsb)
+;; =>
+;; R = BSTRINS X, Y, msb, lsb
+define i32 @pat2(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321 ; 0xfff000ff
+  %and2 = and i32 %b, 4095     ; 0x00000fff
+  %shl = shl i32 %and2, 8
+  %or = or i32 %and1, %shl
+  ret i32 %or
+}
+
+define i32 @pat2_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat2_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 19, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -1048321 ; 0xfff000ff
+  %and2 = and i32 %b, 4095     ; 0x00000fff
+  %shl = shl i32 %and2, 8
+  %or = or i32 %shl, %and1
+  ret i32 %or
+}
+
+;; Pattern 3
+;; R = or (and X, mask0), (and Y, mask1)
+;; =>
+;; R = BSTRINS X, (srl (and Y, mask1), lsb), msb, lsb
+define i32 @pat3(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.w $a1, $a1, 4
+; CHECK-NEXT:    bstrins.w $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -4081 ; 0xfffff00f
+  %and2 = and i32 %b, 288   ; 0x00000120
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+define i32 @pat3_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat3_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi $a1, $a1, 288
+; CHECK-NEXT:    srli.w $a1, $a1, 4
+; CHECK-NEXT:    bstrins.w $a0, $a1, 11, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, -4081 ; 0xfffff00f
+  %and2 = and i32 %b, 288   ; 0x00000120
+  %or = or i32 %and2, %and1
+  ret i32 %or
+}
+
+define i32 @pat3_positive_mask0(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat3_positive_mask0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli.w $a1, $a1, 28
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 28
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and1 = and i32 %a, 268435455  ; 0x0fffffff
+  %and2 = and i32 %b, 4026531840 ; 0xf0000000
+  %or = or i32 %and1, %and2
+  ret i32 %or
+}
+
+;; Pattern 4
+;; R = or (and X, mask), (shl Y, shamt)
+;; =>
+;; R = BSTRINS X, Y, 31, shamt
+define i32 @pat4(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 28
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 268435455 ; 0x0fffffff
+  %shl = shl i32 %b, 28
+  %or = or i32 %and, %shl
+  ret i32 %or
+}
+
+define i32 @pat4_swap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: pat4_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 28
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 268435455 ; 0x0fffffff
+  %shl = shl i32 %b, 28
+  %or = or i32 %shl, %and
+  ret i32 %or
+}
+
+;; Pattern 5
+;; R = or (and X, mask), const
+;; =>
+;; R = BSTRINS X, (const >> lsb), msb, lsb
+define i32 @pat5(i32 %a) nounwind {
+; CHECK-LABEL: pat5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 1
+; CHECK-NEXT:    ori $a1, $a1, 564
+; CHECK-NEXT:    bstrins.w $a0, $a1, 23, 8
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 4278190335 ; 0xff0000ff
+  %or = or i32 %and, 1192960    ; 0x00123400
+  ret i32 %or
+}
+
+;; Pattern 6: a = b | ((c & mask) << shamt)
+;; In this testcase b is 0x10000002, but in fact we do not require b being a
+;; constant. As long as all positions in b to be overwritten by the incoming
+;; bits are known to be zero, the pattern could be matched.
+define i32 @pat6(i32 %c) nounwind {
+; CHECK-LABEL: pat6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 65536
+; CHECK-NEXT:    ori $a1, $a1, 2
+; CHECK-NEXT:    bstrins.w $a1, $a0, 27, 4
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %c, 16777215  ; 0x00ffffff
+  %shl = shl i32 %and, 4
+  %or = or i32 %shl, 268435458 ; 0x10000002
+  ret i32 %or
+}
+
+;; Pattern 7: a = b | ((c << shamt) & shifted_mask)
+;; Similar to pattern 6.
+define i32 @pat7(i32 %c) nounwind {
+; CHECK-LABEL: pat7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 65536
+; CHECK-NEXT:    ori $a1, $a1, 2
+; CHECK-NEXT:    bstrins.w $a1, $a0, 27, 4
+; CHECK-NEXT:    move $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %shl = shl i32 %c, 4
+  %and = and i32 %shl, 268435440 ; 0x0ffffff0
+  %or = or i32 %and, 268435458   ; 0x10000002
+  ret i32 %or
+}
+
+;; Pattern 8: a = b | (c & shifted_mask)
+;; Similar to pattern 7 but without shift to c.
+define i32 @pat8(i32 %c) nounwind {
+; CHECK-LABEL: pat8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srli.w $a1, $a0, 4
+; CHECK-NEXT:    lu12i.w $a0, 65536
+; CHECK-NEXT:    ori $a0, $a0, 2
+; CHECK-NEXT:    bstrins.w $a0, $a1, 27, 4
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %c, 268435440 ; 0x0ffffff0
+  %or = or i32 %and, 268435458 ; 0x10000002
+  ret i32 %or
+}
+
+;; Test that bstrins.w is not generated because constant OR operand
+;; doesn't fit into bits cleared by constant AND operand.
+define i32 @no_bstrins_w(i32 %a) nounwind {
+; CHECK-LABEL: no_bstrins_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 291
+; CHECK-NEXT:    ori $a1, $a1, 1104
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    lu12i.w $a1, -3805
+; CHECK-NEXT:    ori $a1, $a1, 1279
+; CHECK-NEXT:    and $a0, $a0, $a1
+; CHECK-NEXT:    jirl $zero, $ra, 0
+  %and = and i32 %a, 4278190335 ; 0xff0000ff
+  %or = or i32 %and, 1193040    ; 0x00123450
+  ret i32 %or
+}

diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll
index 968a701660c06..33f6dbee748ef 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll
@@ -250,9 +250,8 @@ define double @convert_u32_to_double(i32 %a) nounwind {
 ; LA64-NEXT:    addi.d $a1, $a1, .LCPI12_0
 ; LA64-NEXT:    fld.d $fa1, $a1, 0
 ; LA64-NEXT:    fsub.d $fa0, $fa0, $fa1
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    lu52i.d $a1, $zero, 1075
-; LA64-NEXT:    or $a0, $a0, $a1
+; LA64-NEXT:    lu12i.w $a1, 275200
+; LA64-NEXT:    bstrins.d $a0, $a1, 63, 32
 ; LA64-NEXT:    movgr2fr.d $fa1, $a0
 ; LA64-NEXT:    fadd.d $fa0, $fa1, $fa0
 ; LA64-NEXT:    jirl $zero, $ra, 0
@@ -280,9 +279,8 @@ define double @convert_u64_to_double(i64 %a) nounwind {
 ; LA64-NEXT:    addi.d $a1, $a1, .LCPI13_0
 ; LA64-NEXT:    fld.d $fa1, $a1, 0
 ; LA64-NEXT:    fsub.d $fa0, $fa0, $fa1
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    lu52i.d $a1, $zero, 1075
-; LA64-NEXT:    or $a0, $a0, $a1
+; LA64-NEXT:    lu12i.w $a1, 275200
+; LA64-NEXT:    bstrins.d $a0, $a1, 63, 32
 ; LA64-NEXT:    movgr2fr.d $fa1, $a0
 ; LA64-NEXT:    fadd.d $fa0, $fa1, $fa0
 ; LA64-NEXT:    jirl $zero, $ra, 0


        


More information about the llvm-commits mailing list