[llvm] f62d8a1 - [AArch64] Compare BFI and ORR with left-shifted operand for OR instruction selection.
Mingming Liu via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 3 12:32:25 PDT 2022
Author: Mingming Liu
Date: 2022-11-03T12:32:08-07:00
New Revision: f62d8a1a5044df7b8d72033d056375b4ab256012
URL: https://github.com/llvm/llvm-project/commit/f62d8a1a5044df7b8d72033d056375b4ab256012
DIFF: https://github.com/llvm/llvm-project/commit/f62d8a1a5044df7b8d72033d056375b4ab256012.diff
LOG: [AArch64] Compare BFI and ORR with left-shifted operand for OR instruction selection.
Before this patch:
- For `r = or op0, op1`, `tryBitfieldInsertOpFromOr` combines it into a BFI when
1) one of the two operands is a bit-field-positioning or bit-field-extraction op, and
2) the bits contributed by the two operands don't overlap (a small hand-written illustration of this existing combine follows below).
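For instance, the IR below matches that shape: one operand positions a bit-field with a shift-and-mask, and the two operands cover disjoint bits, so the existing combine can select the OR to an `and` followed by a `bfi`. This is only an illustration written for this note, not part of the patch, and the function/value names are made up.
```
define i32 @bfi_illustration(i32 %a, i32 %b) {
  %lo = and i32 %a, 15           ; keep bits [3:0] of %a
  %positioned = shl i32 %b, 4    ; position %b's low bits at [7:4]
  %hi = and i32 %positioned, 240 ; bit-field-positioning operand
  %r = or i32 %lo, %hi           ; the two operands touch disjoint bits
  ret i32 %r
}
```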
After this patch:
- Right before the OR is combined into a BFI, evaluate whether an ORR with a left-shifted operand is better, and select that instead when it is.
A motivating example (https://godbolt.org/z/rnMrzs5vn), added as the test case `test_orr_not_bfxil_i64` in `CodeGen/AArch64/bitfield-insert.ll`:
For the IR:
```
define i64 @test_orr_not_bfxil(i64 %0) {
%2 = and i64 %0, 127
%3 = lshr i64 %0, 1
%4 = and i64 %3, 16256
%5 = or i64 %4, %2
ret i64 %5
}
```
Before:
```
lsr x8, x0, #1
and x8, x8, #0x3f80
bfxil x8, x0, #0, #7
mov x0, x8
```
After:
```
ubfx x8, x0, #8, #7
and x9, x0, #0x7f
orr x0, x9, x8, lsl #7
```
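To make the bit arithmetic explicit, the selected sequence computes the same value as the hand-written IR below. This is only a sketch for illustration; the function and value names are made up, and the patch rewrites the SelectionDAG during instruction selection rather than the IR.
```
define i64 @orr_lsl_equivalent(i64 %0) {
  %low = and i64 %0, 127     ; bits [6:0] of %0, kept in place (the "and" above)
  %hi = lshr i64 %0, 8       ; bits [14:8] of %0 ...
  %field = and i64 %hi, 127  ; ... extracted, i.e. the "ubfx ..., #8, #7"
  %pos = shl i64 %field, 7   ; repositioned at bits [13:7], folded into "orr ..., lsl #7"
  %5 = or i64 %low, %pos
  ret i64 %5
}
```
The ORR form also removes the tied-destination dependence that BFXIL imposes, which is the dependency-chain improvement noted in the comments of the new code.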
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D135102
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
llvm/test/CodeGen/AArch64/arm64-strict-align.ll
llvm/test/CodeGen/AArch64/arm64_32.ll
llvm/test/CodeGen/AArch64/bfis-in-loop.ll
llvm/test/CodeGen/AArch64/bitfield-insert.ll
llvm/test/CodeGen/AArch64/build-pair-isel.ll
llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
llvm/test/CodeGen/AArch64/load-combine.ll
llvm/test/CodeGen/AArch64/logic-shift.ll
llvm/test/CodeGen/AArch64/nontemporal-load.ll
llvm/test/CodeGen/AArch64/rotate-extract.ll
llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
llvm/test/CodeGen/AArch64/urem-seteq.ll
llvm/test/CodeGen/AArch64/vec_uaddo.ll
llvm/test/CodeGen/AArch64/vec_umulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 844f9c0c7159..de44144dc25b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2803,6 +2803,122 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
return true;
}
+static bool isWorthFoldingIntoOrrWithLeftShift(SDValue Dst,
+ SelectionDAG *CurDAG,
+ SDValue &LeftShiftedOperand,
+ uint64_t &LeftShiftAmount) {
+ // Avoid folding Dst into ORR-with-left-shift if Dst has other uses than ORR.
+ if (!Dst.hasOneUse())
+ return false;
+
+ EVT VT = Dst.getValueType();
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Caller should guarantee that VT is one of i32 or i64");
+ const unsigned SizeInBits = VT.getSizeInBits();
+
+ SDLoc DL(Dst.getNode());
+ uint64_t AndImm, ShlImm;
+ if (isOpcWithIntImmediate(Dst.getNode(), ISD::AND, AndImm) &&
+ isShiftedMask_64(AndImm)) {
+ // Avoid transforming 'DstOp0' if it has other uses than the AND node.
+ SDValue DstOp0 = Dst.getOperand(0);
+ if (!DstOp0.hasOneUse())
+ return false;
+
+ // An example to illustrate the transformation
+ // From:
+ // lsr x8, x1, #1
+ // and x8, x8, #0x3f80
+ // bfxil x8, x1, #0, #7
+ // To:
+ // and x8, x23, #0x7f
+ // ubfx x9, x23, #8, #7
+ // orr x23, x8, x9, lsl #7
+ //
+ // The number of instructions remains the same, but ORR is faster than BFXIL
+ // on many AArch64 processors (or as good as BFXIL if not faster). Besides,
+ // the dependency chain is improved after the transformation.
+ uint64_t SrlImm;
+ if (isOpcWithIntImmediate(DstOp0.getNode(), ISD::SRL, SrlImm)) {
+ uint64_t NumTrailingZeroInShiftedMask = countTrailingZeros(AndImm);
+ if ((SrlImm + NumTrailingZeroInShiftedMask) < SizeInBits) {
+ unsigned MaskWidth =
+ countTrailingOnes(AndImm >> NumTrailingZeroInShiftedMask);
+ unsigned UBFMOpc =
+ (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+ SDNode *UBFMNode = CurDAG->getMachineNode(
+ UBFMOpc, DL, VT, DstOp0.getOperand(0),
+ CurDAG->getTargetConstant(SrlImm + NumTrailingZeroInShiftedMask, DL,
+ VT),
+ CurDAG->getTargetConstant(
+ SrlImm + NumTrailingZeroInShiftedMask + MaskWidth - 1, DL, VT));
+ LeftShiftedOperand = SDValue(UBFMNode, 0);
+ LeftShiftAmount = NumTrailingZeroInShiftedMask;
+ return true;
+ }
+ }
+ } else if (isOpcWithIntImmediate(Dst.getNode(), ISD::SHL, ShlImm)) {
+ LeftShiftedOperand = Dst.getOperand(0);
+ LeftShiftAmount = ShlImm;
+ return true;
+ }
+ // FIXME: Extend the implementation to optimize if Dst is an SRL node.
+ return false;
+}
+
+static bool tryOrrWithLeftShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
+ SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
+ const bool BiggerPattern) {
+ EVT VT = N->getValueType(0);
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Expect result type to be i32 or i64 since N is combinable to BFM");
+ SDLoc DL(N);
+
+ // Bail out if BFM simplifies away one node in BFM Dst.
+ if (OrOpd1 != Dst)
+ return false;
+
+ // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
+ // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
+ if (BiggerPattern) {
+ uint64_t SrcAndImm;
+ if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::AND, SrcAndImm) &&
+ isMask_64(SrcAndImm) && OrOpd0.getOperand(0) == Src) {
+ // OrOpd0 = AND Src, #Mask
+ // So BFM simplifies away one AND node from Src and doesn't simplify away
+ // nodes from Dst. If ORR with left-shifted operand also simplifies away
+ // one node (from Rd), ORR is better since it has higher throughput and
+ // smaller latency than BFM on many AArch64 processors (and for the rest
+ // ORR is at least as good as BFM).
+ SDValue LeftShiftedOperand;
+ uint64_t LeftShiftAmount;
+ if (isWorthFoldingIntoOrrWithLeftShift(Dst, CurDAG, LeftShiftedOperand,
+ LeftShiftAmount)) {
+ unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
+ SDValue Ops[] = {OrOpd0, LeftShiftedOperand,
+ CurDAG->getTargetConstant(LeftShiftAmount, DL, VT)};
+ CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ assert((!BiggerPattern) && "BiggerPattern should be handled above");
+
+ uint64_t ShlImm;
+ // FIXME: Extend the implementation if OrOpd0 is an SRL node.
+ if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
+ OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+ unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
+ SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
+ CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+ return true;
+ }
+
+ return false;
+}
+
static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
@@ -2905,6 +3021,14 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
// or is useful because it discards more bits
Dst = OrOpd1Val;
+ // Before selecting ISD::OR node to AArch64::BFM, see if an AArch64::ORR
+ // with left-shifted operand is more efficient.
+ // FIXME: Extend this to compare AArch64::BFM and AArch64::ORR with
+ // right-shifted operand as well.
+ if (tryOrrWithLeftShift(N, OrOpd0Val, OrOpd1Val, Src, Dst, CurDAG,
+ BiggerPattern))
+ return true;
+
// both parts match
SDLoc DL(N);
SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
diff --git a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
index 86ef69ff4e93..cf72e4b1fce9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -964,9 +964,9 @@ entry:
define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
; LLC-LABEL: test_ignored_rightbits:
; LLC: // %bb.0:
-; LLC-NEXT: and w0, w0, #0x7
-; LLC-NEXT: bfi w0, w1, #3, #4
-; LLC-NEXT: bfi w0, w0, #8, #7
+; LLC-NEXT: and w8, w0, #0x7
+; LLC-NEXT: bfi w8, w1, #3, #4
+; LLC-NEXT: orr w0, w8, w8, lsl #8
; LLC-NEXT: ret
; OPT-LABEL: @test_ignored_rightbits(
; OPT-NEXT: [[POSITIONED_FIELD:%.*]] = shl i32 [[IN:%.*]], 3
@@ -1000,8 +1000,8 @@ define void @sameOperandBFI(i64 %src, i64 %src2, i16 *%ptr) {
; LLC-NEXT: lsr x8, x0, #47
; LLC-NEXT: and w9, w1, #0x3
; LLC-NEXT: bfi w9, w8, #2, #2
-; LLC-NEXT: bfi w9, w9, #4, #4
-; LLC-NEXT: strh w9, [x2]
+; LLC-NEXT: orr w8, w9, w9, lsl #4
+; LLC-NEXT: strh w8, [x2]
; LLC-NEXT: .LBB30_2: // %end
; LLC-NEXT: ret
; OPT-LABEL: @sameOperandBFI(
diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
index eca81e58004c..5a44550cc172 100644
--- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -5,8 +5,8 @@ define i24 @ldi24(ptr %p) nounwind {
; CHECK-LABEL: ldi24:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrh w0, [x0]
-; CHECK-NEXT: bfi w0, w8, #16, #16
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w0, w9, w8, lsl #16
; CHECK-NEXT: ret
%r = load i24, i24* %p
ret i24 %r
@@ -17,9 +17,9 @@ define i56 @ldi56(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #6]
; CHECK-NEXT: ldrh w9, [x0, #4]
-; CHECK-NEXT: ldr w0, [x0]
-; CHECK-NEXT: bfi w9, w8, #16, #16
-; CHECK-NEXT: bfi x0, x9, #32, #32
+; CHECK-NEXT: ldr w10, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr x0, x10, x8, lsl #32
; CHECK-NEXT: ret
%r = load i56, i56* %p
ret i56 %r
@@ -41,10 +41,10 @@ define i120 @ldi120(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #14]
; CHECK-NEXT: ldrh w9, [x0, #12]
-; CHECK-NEXT: ldr w1, [x0, #8]
+; CHECK-NEXT: ldr w10, [x0, #8]
; CHECK-NEXT: ldr x0, [x0]
-; CHECK-NEXT: bfi w9, w8, #16, #16
-; CHECK-NEXT: bfi x1, x9, #32, #32
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr x1, x10, x8, lsl #32
; CHECK-NEXT: ret
%r = load i120, i120* %p
ret i120 %r
@@ -55,10 +55,10 @@ define i280 @ldi280(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x1, [x0]
; CHECK-NEXT: ldrb w9, [x0, #34]
-; CHECK-NEXT: ldrh w4, [x0, #32]
+; CHECK-NEXT: ldrh w10, [x0, #32]
; CHECK-NEXT: ldp x2, x3, [x0, #16]
; CHECK-NEXT: mov x0, x8
-; CHECK-NEXT: bfi x4, x9, #16, #8
+; CHECK-NEXT: orr x4, x10, x9, lsl #16
; CHECK-NEXT: ret
%r = load i280, i280* %p
ret i280 %r
@@ -133,7 +133,7 @@ define void @i56_or(ptr %a) {
; CHECK-NEXT: ldrh w10, [x8, #4]!
; CHECK-NEXT: ldrb w11, [x8, #2]
; CHECK-NEXT: orr w9, w9, #0x180
-; CHECK-NEXT: bfi w10, w11, #16, #16
+; CHECK-NEXT: orr w10, w10, w11, lsl #16
; CHECK-NEXT: str w9, [x0]
; CHECK-NEXT: strb w11, [x8, #2]
; CHECK-NEXT: strh w10, [x8]
@@ -153,7 +153,7 @@ define void @i56_and_or(ptr %a) {
; CHECK-NEXT: ldrb w11, [x8, #2]
; CHECK-NEXT: orr w9, w9, #0x180
; CHECK-NEXT: and w9, w9, #0xffffff80
-; CHECK-NEXT: bfi w10, w11, #16, #16
+; CHECK-NEXT: orr w10, w10, w11, lsl #16
; CHECK-NEXT: strb w11, [x8, #2]
; CHECK-NEXT: str w9, [x0]
; CHECK-NEXT: strh w10, [x8]
@@ -172,11 +172,11 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; CHECK-NEXT: ldr w11, [x0]
; CHECK-NEXT: ldrh w9, [x8, #4]!
; CHECK-NEXT: ldrb w10, [x8, #2]
-; CHECK-NEXT: bfi w9, w10, #16, #8
+; CHECK-NEXT: orr w9, w9, w10, lsl #16
; CHECK-NEXT: strb w10, [x8, #2]
-; CHECK-NEXT: bfi x11, x9, #32, #24
-; CHECK-NEXT: strh w9, [x8]
+; CHECK-NEXT: orr x11, x11, x9, lsl #32
; CHECK-NEXT: and x11, x11, #0xffffffffffffdfff
+; CHECK-NEXT: strh w9, [x8]
; CHECK-NEXT: orr w11, w11, w1, lsl #13
; CHECK-NEXT: str w11, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-strict-align.ll b/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
index 28c158f7a2eb..a7450349766f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -5,7 +5,7 @@
define i32 @f0(i32* nocapture %p) nounwind {
; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
-; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
+; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
; CHECK-STRICT: ret
; CHECK: ldr w0, [x0]
@@ -16,7 +16,7 @@ define i32 @f0(i32* nocapture %p) nounwind {
define i64 @f1(i64* nocapture %p) nounwind {
; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
-; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
+; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
; CHECK-STRICT: ret
; CHECK: ldr x0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll
index fbf12e80b6b5..0eb5b637b08f 100644
--- a/llvm/test/CodeGen/AArch64/arm64_32.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32.ll
@@ -662,8 +662,9 @@ define void @test_struct_hi(i32 %hi) nounwind {
; CHECK-LABEL: test_struct_hi:
; CHECK: mov w[[IN:[0-9]+]], w0
; CHECK: bl _get_int
-; CHECK-FAST-NEXT: mov w0, w0
-; CHECK-NEXT: bfi x0, x[[IN]], #32, #32
+; CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0
+; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32
+; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32
; CHECK-NEXT: bl _take_pair
%val.64 = call i64 @get_int()
%val.32 = trunc i64 %val.64 to i32
diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
index 6ee2feb2c217..5207f2ba32d3 100644
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -28,8 +28,8 @@ define i64 @bfis_in_loop_zero() {
; CHECK-NEXT: ldr x11, [x9, #8]
; CHECK-NEXT: and x9, x10, #0xff
; CHECK-NEXT: and x10, x0, #0xffffffff00000000
-; CHECK-NEXT: bfi x9, x8, #8, #32
-; CHECK-NEXT: bfi x10, x12, #16, #1
+; CHECK-NEXT: orr x9, x9, x8, lsl #8
+; CHECK-NEXT: orr x10, x10, x12, lsl #16
; CHECK-NEXT: orr x0, x10, x9
; CHECK-NEXT: ldr x9, [x11, #16]
; CHECK-NEXT: cbnz x11, .LBB0_1
@@ -97,8 +97,8 @@ define i64 @bfis_in_loop_undef() {
; CHECK-NEXT: ldr x11, [x9, #8]
; CHECK-NEXT: and x9, x10, #0xff
; CHECK-NEXT: and x10, x0, #0xffffffff00000000
-; CHECK-NEXT: bfi x9, x8, #8, #32
-; CHECK-NEXT: bfi x10, x12, #16, #1
+; CHECK-NEXT: orr x9, x9, x8, lsl #8
+; CHECK-NEXT: orr x10, x10, x12, lsl #16
; CHECK-NEXT: orr x0, x10, x9
; CHECK-NEXT: ldr x9, [x11, #16]
; CHECK-NEXT: cbnz x11, .LBB1_1
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 3bd320dc40b5..b8e69d5cfaaf 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -269,8 +269,7 @@ define i32 @test_nouseful_bits(i8 %a, i32 %b) {
; CHECK-NEXT: lsl w8, w8, #8
; CHECK-NEXT: mov w9, w8
; CHECK-NEXT: bfxil w9, w0, #0, #8
-; CHECK-NEXT: bfi w8, w9, #16, #16
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%conv = zext i8 %a to i32 ; 0 0 0 A
%shl = shl i32 %b, 8 ; B2 B1 B0 0
@@ -612,10 +611,9 @@ define i64 @test_and_extended_shift_with_imm(i64 %0) {
define i64 @test_orr_not_bfxil_i64(i64 %0) {
; CHECK-LABEL: test_orr_not_bfxil_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #1
-; CHECK-NEXT: and x8, x8, #0x3f80
-; CHECK-NEXT: bfxil x8, x0, #0, #7
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ubfx x8, x0, #8, #7
+; CHECK-NEXT: and x9, x0, #0x7f
+; CHECK-NEXT: orr x0, x9, x8, lsl #7
; CHECK-NEXT: ret
%2 = and i64 %0, 127
%3 = lshr i64 %0, 1
@@ -628,10 +626,9 @@ define i64 @test_orr_not_bfxil_i64(i64 %0) {
define i32 @test_orr_not_bfxil_i32(i32 %0) {
; CHECK-LABEL: test_orr_not_bfxil_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #1
-; CHECK-NEXT: and w8, w8, #0x3f80
-; CHECK-NEXT: bfxil w8, w0, #0, #7
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: ubfx w8, w0, #8, #7
+; CHECK-NEXT: and w9, w0, #0x7f
+; CHECK-NEXT: orr w0, w9, w8, lsl #7
; CHECK-NEXT: ret
%2 = and i32 %0, 127
%3 = lshr i32 %0, 1
diff --git a/llvm/test/CodeGen/AArch64/build-pair-isel.ll b/llvm/test/CodeGen/AArch64/build-pair-isel.ll
index b9f03ed872cf..970a2c69343f 100644
--- a/llvm/test/CodeGen/AArch64/build-pair-isel.ll
+++ b/llvm/test/CodeGen/AArch64/build-pair-isel.ll
@@ -14,7 +14,7 @@ define void @compare_and_swap128() {
; CHECK-NEXT: mov w9, w10
; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: // kill: def $x8 killed $w8
-; CHECK-NEXT: bfi x8, x9, #32, #32
+; CHECK-NEXT: orr x8, x8, x9, lsl #32
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: str x8, [x9]
; CHECK-NEXT: ret
@@ -22,5 +22,3 @@ define void @compare_and_swap128() {
store i128 %1, i128* undef, align 16
ret void
}
-
-
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
index bb37cc81a7ab..c4481871dec4 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -19,8 +19,7 @@ define i8 @rotl_i8_const_shift(i8 %x) {
; CHECK-LABEL: rotl_i8_const_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: ubfx w8, w0, #5, #3
-; CHECK-NEXT: bfi w8, w0, #3, #29
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: orr w0, w8, w0, lsl #3
; CHECK-NEXT: ret
%f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
ret i8 %f
diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
index 43e04e341b7e..bff4f2113df3 100644
--- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -463,8 +463,8 @@ define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: lsl w0, w8, #8
-; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: lsl w8, w8, #8
+; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
@@ -486,8 +486,8 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: lsl w0, w8, #16
-; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: orr w0, w8, w9, lsl #24
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
@@ -527,8 +527,8 @@ define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #1]
; CHECK-NEXT: ldrb w9, [x0]
-; CHECK-NEXT: lsl w0, w8, #8
-; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: lsl w8, w8, #8
+; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
@@ -550,8 +550,8 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #1]
; CHECK-NEXT: ldrb w9, [x0]
-; CHECK-NEXT: lsl w0, w8, #16
-; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: orr w0, w8, w9, lsl #24
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
@@ -576,8 +576,8 @@ define i16 @load_i16_from_nonzero_offset(i8* %p) {
; CHECK-LABEL: load_i16_from_nonzero_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrb w0, [x0, #2]
-; CHECK-NEXT: bfi w0, w8, #8, #24
+; CHECK-NEXT: ldrb w9, [x0, #2]
+; CHECK-NEXT: orr w0, w9, w8, lsl #8
; CHECK-NEXT: ret
%p1.i16 = bitcast i8* %p to i16*
%p2.i8 = getelementptr i8, i8* %p, i64 2
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
index 293967bcec75..de1b0f13adf0 100644
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -453,8 +453,8 @@ define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: lsl w0, w8, #8
-; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: lsl w8, w8, #8
+; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
@@ -477,8 +477,8 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: lsl w0, w8, #16
-; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: orr w0, w8, w9, lsl #24
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
@@ -521,8 +521,8 @@ define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #1]
; CHECK-NEXT: ldrb w9, [x0]
-; CHECK-NEXT: lsl w0, w8, #8
-; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: lsl w8, w8, #8
+; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
@@ -545,8 +545,8 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #1]
; CHECK-NEXT: ldrb w9, [x0]
-; CHECK-NEXT: lsl w0, w8, #16
-; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: orr w0, w8, w9, lsl #24
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
@@ -603,7 +603,7 @@ define void @short_vector_to_i32_unused_low_i8(<4 x i8>* %in, i32* %out, i32* %p
; CHECK-NEXT: umov w10, v0.h[3]
; CHECK-NEXT: lsl w8, w8, #16
; CHECK-NEXT: bfi w8, w9, #8, #8
-; CHECK-NEXT: bfi w8, w10, #24, #8
+; CHECK-NEXT: orr w8, w8, w10, lsl #24
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -634,8 +634,8 @@ define void @short_vector_to_i32_unused_high_i8(<4 x i8>* %in, i32* %out, i32* %
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: bfi w9, w8, #16, #8
-; CHECK-NEXT: str w9, [x1]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -665,7 +665,7 @@ define void @short_vector_to_i32_unused_low_i16(<4 x i8>* %in, i32* %out, i32* %
; CHECK-NEXT: umov w8, v0.h[3]
; CHECK-NEXT: umov w9, v0.h[2]
; CHECK-NEXT: lsl w8, w8, #24
-; CHECK-NEXT: bfi w8, w9, #16, #8
+; CHECK-NEXT: orr w8, w8, w9, lsl #16
; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%ld = load <4 x i8>, <4 x i8>* %in, align 4
diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll
index 12c3e18317f8..ba63c4433a2a 100644
--- a/llvm/test/CodeGen/AArch64/logic-shift.ll
+++ b/llvm/test/CodeGen/AArch64/logic-shift.ll
@@ -818,8 +818,7 @@ define i32 @or_fshr_wrong_shift(i32 %x, i32 %y) {
; CHECK: // %bb.0:
; CHECK-NEXT: orr w8, w0, w1
; CHECK-NEXT: lsr w8, w8, #26
-; CHECK-NEXT: bfi w8, w0, #7, #25
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: orr w0, w8, w0, lsl #7
; CHECK-NEXT: ret
%or1 = or i32 %x, %y
%sh1 = shl i32 %x, 7
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index 288ba22e7928..f8ff50b6e4c5 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -490,27 +490,27 @@ define <4 x i65> @test_ldnp_v4i65(<4 x i65>* %A) {
;
; CHECK-BE-LABEL: test_ldnp_v4i65:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldp x9, x8, [x0, #16]
-; CHECK-BE-NEXT: ldp x11, x10, [x0]
-; CHECK-BE-NEXT: ldrb w7, [x0, #32]
-; CHECK-BE-NEXT: lsr x13, x9, #56
-; CHECK-BE-NEXT: lsr x14, x11, #56
-; CHECK-BE-NEXT: extr x15, x10, x9, #56
-; CHECK-BE-NEXT: bfi x7, x8, #8, #56
-; CHECK-BE-NEXT: extr x8, x9, x8, #56
-; CHECK-BE-NEXT: extr x12, x11, x10, #56
-; CHECK-BE-NEXT: lsr x11, x11, #59
-; CHECK-BE-NEXT: ubfx x9, x9, #57, #1
+; CHECK-BE-NEXT: ldp x10, x9, [x0, #16]
+; CHECK-BE-NEXT: ldp x12, x11, [x0]
+; CHECK-BE-NEXT: ldrb w8, [x0, #32]
+; CHECK-BE-NEXT: lsr x13, x10, #56
+; CHECK-BE-NEXT: lsr x14, x12, #56
+; CHECK-BE-NEXT: extr x15, x11, x10, #56
+; CHECK-BE-NEXT: orr x7, x8, x9, lsl #8
+; CHECK-BE-NEXT: extr x8, x10, x9, #56
+; CHECK-BE-NEXT: extr x9, x12, x11, #56
+; CHECK-BE-NEXT: lsr x12, x12, #59
+; CHECK-BE-NEXT: ubfx x10, x10, #57, #1
; CHECK-BE-NEXT: extr x5, x13, x8, #1
-; CHECK-BE-NEXT: extr x1, x14, x12, #3
-; CHECK-BE-NEXT: ubfx x12, x10, #58, #1
-; CHECK-BE-NEXT: fmov d0, x11
-; CHECK-BE-NEXT: and x11, x8, #0x1
-; CHECK-BE-NEXT: lsr x10, x10, #56
-; CHECK-BE-NEXT: fmov d2, x9
-; CHECK-BE-NEXT: fmov d1, x12
-; CHECK-BE-NEXT: extr x3, x10, x15, #2
-; CHECK-BE-NEXT: fmov d3, x11
+; CHECK-BE-NEXT: extr x1, x14, x9, #3
+; CHECK-BE-NEXT: ubfx x9, x11, #58, #1
+; CHECK-BE-NEXT: fmov d0, x12
+; CHECK-BE-NEXT: and x12, x8, #0x1
+; CHECK-BE-NEXT: lsr x11, x11, #56
+; CHECK-BE-NEXT: fmov d2, x10
+; CHECK-BE-NEXT: fmov d1, x9
+; CHECK-BE-NEXT: extr x3, x11, x15, #2
+; CHECK-BE-NEXT: fmov d3, x12
; CHECK-BE-NEXT: mov v0.d[1], x1
; CHECK-BE-NEXT: mov v2.d[1], x5
; CHECK-BE-NEXT: mov v1.d[1], x3
diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll
index 9a1c6a965bf7..20008c41c42e 100644
--- a/llvm/test/CodeGen/AArch64/rotate-extract.ll
+++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll
@@ -113,8 +113,8 @@ define i64 @no_extract_mul(i64 %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x0, x0, lsl #3
-; CHECK-NEXT: lsr x0, x8, #57
-; CHECK-NEXT: bfi x0, x8, #8, #56
+; CHECK-NEXT: lsr x9, x8, #57
+; CHECK-NEXT: orr x0, x9, x8, lsl #8
; CHECK-NEXT: ret
%lhs_mul = mul i64 %i, 2304
%rhs_mul = mul i64 %i, 9
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 85f20ba18cc1..7b5041fc58cc 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -391,20 +391,20 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: fmov s0, w10
; CHECK-NEXT: ubfx x12, x9, #12, #20
; CHECK-NEXT: lsr x15, x9, #31
-; CHECK-NEXT: bfi w13, w11, #16, #8
-; CHECK-NEXT: lsr x11, x9, #50
+; CHECK-NEXT: orr w11, w13, w11, lsl #16
+; CHECK-NEXT: lsr x13, x9, #50
; CHECK-NEXT: mov.s v0[1], w14
; CHECK-NEXT: fmov s1, w12
; CHECK-NEXT: lsr x12, x10, #38
-; CHECK-NEXT: bfi w11, w13, #14, #18
+; CHECK-NEXT: orr w13, w13, w11, lsl #14
; CHECK-NEXT: lsr x10, x10, #57
-; CHECK-NEXT: bfi w10, w9, #7, #25
-; CHECK-NEXT: lsr w9, w13, #5
+; CHECK-NEXT: orr w9, w10, w9, lsl #7
+; CHECK-NEXT: lsr w10, w11, #5
; CHECK-NEXT: mov.s v1[1], w15
; CHECK-NEXT: mov.s v0[2], w12
-; CHECK-NEXT: mov.s v1[2], w11
-; CHECK-NEXT: mov.s v0[3], w10
-; CHECK-NEXT: mov.s v1[3], w9
+; CHECK-NEXT: mov.s v1[2], w13
+; CHECK-NEXT: mov.s v0[3], w9
+; CHECK-NEXT: mov.s v1[3], w10
; CHECK-NEXT: uzp1.8h v0, v0, v1
; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1, x8, lsl #3]
@@ -420,21 +420,21 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: .LBB5_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ldp x10, x9, [x0]
-; CHECK-BE-NEXT: ldrh w15, [x0, #16]
-; CHECK-BE-NEXT: lsr x12, x10, #40
+; CHECK-BE-NEXT: ldrh w11, [x0, #16]
; CHECK-BE-NEXT: lsr x13, x10, #45
-; CHECK-BE-NEXT: lsr x11, x9, #40
+; CHECK-BE-NEXT: lsr x15, x10, #40
+; CHECK-BE-NEXT: lsr x12, x9, #40
; CHECK-BE-NEXT: ubfx x14, x9, #33, #7
; CHECK-BE-NEXT: ubfx x16, x10, #26, #14
-; CHECK-BE-NEXT: bfi w16, w12, #14, #18
-; CHECK-BE-NEXT: ubfx x12, x9, #14, #18
-; CHECK-BE-NEXT: bfi w14, w11, #7, #24
-; CHECK-BE-NEXT: ldrb w11, [x0, #18]
+; CHECK-BE-NEXT: orr w12, w14, w12, lsl #7
+; CHECK-BE-NEXT: ldrb w14, [x0, #18]
+; CHECK-BE-NEXT: orr w15, w16, w15, lsl #14
; CHECK-BE-NEXT: fmov s0, w13
; CHECK-BE-NEXT: add x0, x0, #32
-; CHECK-BE-NEXT: fmov s1, w14
-; CHECK-BE-NEXT: bfi w11, w15, #8, #16
-; CHECK-BE-NEXT: mov v0.s[1], w16
+; CHECK-BE-NEXT: fmov s1, w12
+; CHECK-BE-NEXT: ubfx x12, x9, #14, #18
+; CHECK-BE-NEXT: orr w11, w14, w11, lsl #8
+; CHECK-BE-NEXT: mov v0.s[1], w15
; CHECK-BE-NEXT: mov v1.s[1], w12
; CHECK-BE-NEXT: extr x12, x10, x9, #40
; CHECK-BE-NEXT: lsl x9, x9, #24
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll
index 9f9e3f712a62..56b030dcca52 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll
@@ -82,8 +82,8 @@ define i16 @test_urem_even(i16 %X) nounwind {
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: and w9, w8, #0xfffc
; CHECK-NEXT: lsr w9, w9, #1
-; CHECK-NEXT: bfi w9, w8, #15, #17
-; CHECK-NEXT: ubfx w8, w9, #1, #15
+; CHECK-NEXT: orr w8, w9, w8, lsl #15
+; CHECK-NEXT: ubfx w8, w8, #1, #15
; CHECK-NEXT: cmp w8, #2340
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index eab5c8abd020..9e73cc5195e4 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -249,17 +249,18 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: umov w9, v0.h[1]
; CHECK-NEXT: umov w10, v0.h[2]
; CHECK-NEXT: umov w11, v0.h[3]
; CHECK-NEXT: and v1.8b, v0.8b, v2.8b
; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: bfi w9, w8, #1, #1
-; CHECK-NEXT: bfi w9, w10, #2, #1
+; CHECK-NEXT: and w8, w8, #0x1
+; CHECK-NEXT: bfi w8, w9, #1, #1
; CHECK-NEXT: mvn v0.8b, v0.8b
-; CHECK-NEXT: bfi w9, w11, #3, #29
-; CHECK-NEXT: and w8, w9, #0xf
+; CHECK-NEXT: bfi w8, w10, #2, #1
+; CHECK-NEXT: orr w8, w8, w11, lsl #3
+; CHECK-NEXT: and w8, w8, #0xf
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index 0fccb574644f..4b61a873706a 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -299,14 +299,15 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: fmov d2, d0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: umov w9, v1.h[0]
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: umov w9, v1.h[1]
; CHECK-NEXT: umov w10, v1.h[2]
; CHECK-NEXT: umov w11, v1.h[3]
-; CHECK-NEXT: bfi w9, w8, #1, #1
-; CHECK-NEXT: bfi w9, w10, #2, #1
-; CHECK-NEXT: bfi w9, w11, #3, #29
-; CHECK-NEXT: and w8, w9, #0xf
+; CHECK-NEXT: and w8, w8, #0x1
+; CHECK-NEXT: bfi w8, w9, #1, #1
+; CHECK-NEXT: bfi w8, w10, #2, #1
+; CHECK-NEXT: orr w8, w8, w11, lsl #3
+; CHECK-NEXT: and w8, w8, #0xf
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)