[llvm] [SelectionDAG] Use SimplifyDemandedBits from SimplifyDemandedVectorElts Bitcast. (PR #133717)

David Green via llvm-commits <llvm-commits at lists.llvm.org>
Mon Mar 31 23:56:36 PDT 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/133717

From 21f60c12b1390bb43e7cde5cda309e13b80dbed6 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 31 Mar 2025 14:28:09 +0100
Subject: [PATCH 1/2] [SelectionDAG] Use SimplifyDemandedBits from
 SimplifyDemandedVectorElts Bitcast.

This adds a call to SimplifyDemandedBits when SimplifyDemandedVectorElts
visits a bitcast with a scalar input type, translating the demanded vector
elements into demanded bits of the scalar so that the input can be simplified.
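
As a reference for the mapping, here is a minimal standalone sketch (plain
uint64_t standing in for APInt, little-endian only): demanding lane I of the
vector result demands bits [I*EltSize, (I+1)*EltSize) of the scalar bitcast
input. The names below are illustrative, not the LLVM API.

  // demanded_bits_sketch.cpp - toy model of the lane-to-bit mapping.
  #include <cstdint>
  #include <cstdio>

  // Mask of scalar bits demanded when only the lanes set in DemandedElts
  // are used. Assumes NumElts <= 64 and EltSize < 64 so the toy shifts
  // stay in range; the real code uses APInt and has no such limits.
  static uint64_t demandedScalarBits(uint64_t DemandedElts, unsigned NumElts,
                                     unsigned EltSize) {
    uint64_t DemandedSrcBits = 0;
    for (unsigned I = 0; I != NumElts; ++I)
      if (DemandedElts & (1ULL << I))
        DemandedSrcBits |= ((1ULL << EltSize) - 1) << (I * EltSize);
    return DemandedSrcBits;
  }

  int main() {
    // v8i8 bitcast of an i64 where only lanes 0-3 are used: just the low
    // 32 bits of the scalar are demanded, so e.g. a sign-extension that
    // only produces the high half can be dropped (as in the AArch64
    // extractbitcastext tests below, where the sxtw disappears).
    printf("%#llx\n", (unsigned long long)demandedScalarBits(0xF, 8, 8));
    // prints 0xffffffff
  }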
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  18 ++-
 llvm/test/CodeGen/AArch64/bitcast-extend.ll   |  68 ++++++---
 llvm/test/CodeGen/Thumb2/mve-vdup.ll          |   4 +-
 .../WebAssembly/simd-shuffle-bitcast.ll       |   4 +-
 llvm/test/CodeGen/X86/kmov.ll                 | 136 +++++++-----------
 .../CodeGen/X86/vector-reduce-fmax-nnan.ll    |   1 -
 6 files changed, 116 insertions(+), 115 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 10006a9d76785..c249929d35d5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3163,10 +3163,22 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     SDValue Src = Op.getOperand(0);
     EVT SrcVT = Src.getValueType();
 
-    // We only handle vectors here.
-    // TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits?
-    if (!SrcVT.isVector())
+    if (!SrcVT.isVector()) {
+      // TODO - bigendian once we have test coverage.
+      if (IsLE) {
+        APInt DemandedSrcBits = APInt::getZero(SrcVT.getSizeInBits());
+        for (unsigned i = 0; i != NumElts; ++i)
+          if (DemandedElts[i]) {
+            unsigned Offset = i * VT.getScalarSizeInBits();
+            DemandedSrcBits.insertBits(
+                APInt::getAllOnes(VT.getScalarSizeInBits()), Offset);
+          }
+        KnownBits Known;
+        if (SimplifyDemandedBits(Src, DemandedSrcBits, Known, TLO, Depth + 1))
+          return true;
+      }
       break;
+    }
 
     // Fast handling of 'identity' bitcasts.
     unsigned NumSrcElts = SrcVT.getVectorNumElements();
diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index 195c740022d10..85daa3ca6623e 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -217,17 +217,28 @@ define <4 x i64> @s_i32_v4i64(i32 %x) {
 }
 
 define void @extractbitcastext(i32 %bytes, ptr %output) {
-; CHECK-LABEL: extractbitcastext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extractbitcastext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT:    stp q1, q0, [x1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extractbitcastext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    sxtw x8, w0
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    stp q1, q0, [x1]
+; CHECK-GI-NEXT:    ret
   %conv = sext i32 %bytes to i64
   %b0 = bitcast i64 %conv to <8 x i8>
   %b1 = zext <8 x i8> %b0 to <8 x i16>
@@ -244,17 +255,28 @@ define void @extractbitcastext(i32 %bytes, ptr %output) {
 }
 
 define void @extractbitcastext_s(i32 %bytes, ptr %output) {
-; CHECK-LABEL: extractbitcastext_s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    stp q1, q0, [x1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extractbitcastext_s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-SD-NEXT:    sshll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT:    stp q1, q0, [x1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extractbitcastext_s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    sxtw x8, w0
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    stp q1, q0, [x1]
+; CHECK-GI-NEXT:    ret
   %conv = sext i32 %bytes to i64
   %b0 = bitcast i64 %conv to <8 x i8>
   %b1 = sext <8 x i8> %b0 to <8 x i16>
@@ -271,3 +293,5 @@ define void @extractbitcastext_s(i32 %bytes, ptr %output) {
 }
 
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
index 9ba3866ad4730..77fa9f297e678 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -371,7 +371,7 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_i64_v8i16(i64 %a) {
 ; CHECK-LE:       @ %bb.0:
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    strd r0, r1, [sp]
+; CHECK-LE-NEXT:    str r0, [sp]
 ; CHECK-LE-NEXT:    mov r0, sp
 ; CHECK-LE-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-LE-NEXT:    vmov r0, s0
@@ -420,7 +420,7 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) {
 ; CHECK-LE:       @ %bb.0:
 ; CHECK-LE-NEXT:    .pad #8
 ; CHECK-LE-NEXT:    sub sp, #8
-; CHECK-LE-NEXT:    strd r0, r1, [sp]
+; CHECK-LE-NEXT:    str r0, [sp]
 ; CHECK-LE-NEXT:    mov r0, sp
 ; CHECK-LE-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-LE-NEXT:    vmov r0, s1
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
index 1f539f1652004..4eca61d08af7f 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
@@ -27,8 +27,8 @@ define <2 x i2> @i2x2_splat(i1 %x) {
 
 ; CHECK-LABEL: not_a_vec:
 ; CHECK-NEXT: .functype not_a_vec (i64, i64) -> (v128){{$}}
-; CHECK-NEXT: i32.wrap_i64    $push[[L:[0-9]+]]=, $0
-; CHECK-NEXT: i32x4.splat     $push[[R:[0-9]+]]=, $pop[[L]]
+; CHECK-NEXT: i64x2.splat     $push[[L:[0-9]+]]=, $0
+; CHECK-NEXT: i8x16.shuffle   $push[[R:[0-9]+]]=, $pop[[L]], $2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK-NEXT: return $pop[[R]]
 define <4 x i32> @not_a_vec(i128 %x) {
   %a = bitcast i128 %x to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index 55fb2527722a4..5e31baa1ec72f 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -386,36 +386,28 @@ define <32 x i1> @invert_i32_mask_extract_32(i32 %mask) {
 define <32 x i1> @i64_mask_extract_32(i64 %mask) {
 ; X64-AVX512-LABEL: i64_mask_extract_32:
 ; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    movq %rdi, %rax
-; X64-AVX512-NEXT:    kmovd %eax, %k0
-; X64-AVX512-NEXT:    movzbl %ah, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k1
-; X64-AVX512-NEXT:    kunpckbw %k0, %k1, %k0
-; X64-AVX512-NEXT:    movl %eax, %ecx
-; X64-AVX512-NEXT:    shrl $24, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k1
-; X64-AVX512-NEXT:    shrl $16, %eax
-; X64-AVX512-NEXT:    movzbl %al, %eax
-; X64-AVX512-NEXT:    kmovd %eax, %k2
-; X64-AVX512-NEXT:    kunpckbw %k2, %k1, %k1
-; X64-AVX512-NEXT:    kunpckwd %k0, %k1, %k0
+; X64-AVX512-NEXT:    kmovq %rdi, %k0
+; X64-AVX512-NEXT:    kshiftrd $8, %k0, %k1
+; X64-AVX512-NEXT:    kunpckbw %k0, %k1, %k1
+; X64-AVX512-NEXT:    kshiftrd $16, %k0, %k2
+; X64-AVX512-NEXT:    kshiftrd $24, %k0, %k0
+; X64-AVX512-NEXT:    kunpckbw %k2, %k0, %k0
+; X64-AVX512-NEXT:    kunpckwd %k1, %k0, %k0
 ; X64-AVX512-NEXT:    vpmovm2b %k0, %ymm0
 ; X64-AVX512-NEXT:    retq
 ;
 ; X64-KNL-LABEL: i64_mask_extract_32:
 ; X64-KNL:       # %bb.0:
-; X64-KNL-NEXT:    movq %rdi, %rax
-; X64-KNL-NEXT:    movl %eax, %ecx
+; X64-KNL-NEXT:    movl %edi, %eax
+; X64-KNL-NEXT:    shrl $16, %eax
 ; X64-KNL-NEXT:    kmovw %eax, %k0
-; X64-KNL-NEXT:    movzbl %ah, %edx
-; X64-KNL-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-KNL-NEXT:    movl %edi, %eax
 ; X64-KNL-NEXT:    shrl $24, %eax
 ; X64-KNL-NEXT:    kmovw %eax, %k1
-; X64-KNL-NEXT:    shrl $16, %ecx
-; X64-KNL-NEXT:    movzbl %cl, %eax
-; X64-KNL-NEXT:    kmovw %eax, %k2
-; X64-KNL-NEXT:    kunpckbw %k2, %k1, %k1
-; X64-KNL-NEXT:    kmovw %edx, %k2
+; X64-KNL-NEXT:    kunpckbw %k0, %k1, %k1
+; X64-KNL-NEXT:    kmovw %edi, %k0
+; X64-KNL-NEXT:    shrl $8, %edi
+; X64-KNL-NEXT:    kmovw %edi, %k2
 ; X64-KNL-NEXT:    kunpckbw %k0, %k2, %k2
 ; X64-KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
 ; X64-KNL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -480,82 +472,56 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) {
 define <64 x i1> @i64_mask_extract_64(i64 %mask) {
 ; X64-AVX512-LABEL: i64_mask_extract_64:
 ; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    movq %rdi, %rax
-; X64-AVX512-NEXT:    kmovd %eax, %k0
-; X64-AVX512-NEXT:    movzbl %ah, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k1
-; X64-AVX512-NEXT:    kunpckbw %k0, %k1, %k0
-; X64-AVX512-NEXT:    movl %eax, %ecx
-; X64-AVX512-NEXT:    shrl $24, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k1
-; X64-AVX512-NEXT:    movl %eax, %ecx
-; X64-AVX512-NEXT:    shrl $16, %ecx
-; X64-AVX512-NEXT:    movzbl %cl, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k2
-; X64-AVX512-NEXT:    kunpckbw %k2, %k1, %k1
-; X64-AVX512-NEXT:    kunpckwd %k0, %k1, %k0
-; X64-AVX512-NEXT:    movq %rdi, %rcx
-; X64-AVX512-NEXT:    shrq $32, %rcx
-; X64-AVX512-NEXT:    movzbl %cl, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k1
-; X64-AVX512-NEXT:    movq %rdi, %rcx
-; X64-AVX512-NEXT:    shrq $40, %rcx
-; X64-AVX512-NEXT:    movzbl %cl, %ecx
-; X64-AVX512-NEXT:    kmovd %ecx, %k2
+; X64-AVX512-NEXT:    kmovq %rdi, %k0
+; X64-AVX512-NEXT:    kshiftrq $32, %k0, %k1
+; X64-AVX512-NEXT:    kshiftrq $40, %k0, %k2
 ; X64-AVX512-NEXT:    kunpckbw %k1, %k2, %k1
-; X64-AVX512-NEXT:    movq %rdi, %rcx
-; X64-AVX512-NEXT:    shrq $56, %rcx
-; X64-AVX512-NEXT:    kmovd %ecx, %k2
-; X64-AVX512-NEXT:    shrq $48, %rax
-; X64-AVX512-NEXT:    movzbl %al, %eax
-; X64-AVX512-NEXT:    kmovd %eax, %k3
-; X64-AVX512-NEXT:    kunpckbw %k3, %k2, %k2
+; X64-AVX512-NEXT:    kshiftrq $48, %k0, %k2
+; X64-AVX512-NEXT:    kshiftrq $56, %k0, %k3
+; X64-AVX512-NEXT:    kunpckbw %k2, %k3, %k2
 ; X64-AVX512-NEXT:    kunpckwd %k1, %k2, %k1
+; X64-AVX512-NEXT:    kshiftrd $8, %k0, %k2
+; X64-AVX512-NEXT:    kunpckbw %k0, %k2, %k2
+; X64-AVX512-NEXT:    kshiftrd $16, %k0, %k3
+; X64-AVX512-NEXT:    kshiftrd $24, %k0, %k0
+; X64-AVX512-NEXT:    kunpckbw %k3, %k0, %k0
+; X64-AVX512-NEXT:    kunpckwd %k2, %k0, %k0
 ; X64-AVX512-NEXT:    kunpckdq %k0, %k1, %k0
 ; X64-AVX512-NEXT:    vpmovm2b %k0, %zmm0
 ; X64-AVX512-NEXT:    retq
 ;
 ; X64-KNL-LABEL: i64_mask_extract_64:
 ; X64-KNL:       # %bb.0:
-; X64-KNL-NEXT:    pushq %rbx
-; X64-KNL-NEXT:    .cfi_def_cfa_offset 16
-; X64-KNL-NEXT:    .cfi_offset %rbx, -16
-; X64-KNL-NEXT:    movq %rsi, %rcx
 ; X64-KNL-NEXT:    movq %rdi, %rax
-; X64-KNL-NEXT:    movl %ecx, %edx
-; X64-KNL-NEXT:    movq %rsi, %rdi
-; X64-KNL-NEXT:    movq %rsi, %r8
-; X64-KNL-NEXT:    movq %rsi, %r9
-; X64-KNL-NEXT:    kmovw %ecx, %k0
-; X64-KNL-NEXT:    movzbl %ch, %ebx
-; X64-KNL-NEXT:    # kill: def $ecx killed $ecx killed $rcx
-; X64-KNL-NEXT:    shrl $24, %ecx
+; X64-KNL-NEXT:    kmovw %esi, %k0
+; X64-KNL-NEXT:    movl %esi, %ecx
+; X64-KNL-NEXT:    shrl $8, %ecx
+; X64-KNL-NEXT:    kmovw %ecx, %k1
+; X64-KNL-NEXT:    kunpckbw %k0, %k1, %k0
+; X64-KNL-NEXT:    movl %esi, %ecx
+; X64-KNL-NEXT:    shrl $16, %ecx
 ; X64-KNL-NEXT:    kmovw %ecx, %k1
-; X64-KNL-NEXT:    shrl $16, %edx
-; X64-KNL-NEXT:    movzbl %dl, %ecx
+; X64-KNL-NEXT:    movl %esi, %ecx
+; X64-KNL-NEXT:    shrl $24, %ecx
 ; X64-KNL-NEXT:    kmovw %ecx, %k2
-; X64-KNL-NEXT:    shrq $32, %rsi
-; X64-KNL-NEXT:    movzbl %sil, %ecx
+; X64-KNL-NEXT:    kunpckbw %k1, %k2, %k1
+; X64-KNL-NEXT:    movq %rsi, %rcx
+; X64-KNL-NEXT:    shrq $32, %rcx
+; X64-KNL-NEXT:    kmovw %ecx, %k2
+; X64-KNL-NEXT:    movq %rsi, %rcx
+; X64-KNL-NEXT:    shrq $40, %rcx
+; X64-KNL-NEXT:    kmovw %ecx, %k3
+; X64-KNL-NEXT:    kunpckbw %k2, %k3, %k2
+; X64-KNL-NEXT:    movq %rsi, %rcx
+; X64-KNL-NEXT:    shrq $48, %rcx
 ; X64-KNL-NEXT:    kmovw %ecx, %k3
-; X64-KNL-NEXT:    shrq $40, %rdi
-; X64-KNL-NEXT:    movzbl %dil, %ecx
-; X64-KNL-NEXT:    kmovw %ecx, %k4
-; X64-KNL-NEXT:    kunpckbw %k2, %k1, %k1
-; X64-KNL-NEXT:    shrq $56, %r8
-; X64-KNL-NEXT:    kmovw %r8d, %k2
+; X64-KNL-NEXT:    shrq $56, %rsi
+; X64-KNL-NEXT:    kmovw %esi, %k4
 ; X64-KNL-NEXT:    kunpckbw %k3, %k4, %k3
-; X64-KNL-NEXT:    shrq $48, %r9
-; X64-KNL-NEXT:    movzbl %r9b, %ecx
-; X64-KNL-NEXT:    kmovw %ecx, %k4
-; X64-KNL-NEXT:    kunpckbw %k4, %k2, %k2
-; X64-KNL-NEXT:    kmovw %ebx, %k4
-; X64-KNL-NEXT:    kunpckbw %k0, %k4, %k0
-; X64-KNL-NEXT:    kmovw %k0, (%rax)
-; X64-KNL-NEXT:    kmovw %k2, 6(%rax)
-; X64-KNL-NEXT:    kmovw %k3, 4(%rax)
-; X64-KNL-NEXT:    kmovw %k1, 2(%rax)
-; X64-KNL-NEXT:    popq %rbx
-; X64-KNL-NEXT:    .cfi_def_cfa_offset 8
+; X64-KNL-NEXT:    kmovw %k3, 6(%rdi)
+; X64-KNL-NEXT:    kmovw %k2, 4(%rdi)
+; X64-KNL-NEXT:    kmovw %k1, 2(%rdi)
+; X64-KNL-NEXT:    kmovw %k0, (%rdi)
 ; X64-KNL-NEXT:    retq
   %.splatinsert = insertelement <64 x i64> poison, i64 %mask, i64 0
   %.splat = shufflevector <64 x i64> %.splatinsert, <64 x i64> poison, <64 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index f0f430abc48dc..060bd1764d3c4 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -417,7 +417,6 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm3
 ; AVX512F-NEXT:    vucomiss %xmm3, %xmm2
 ; AVX512F-NEXT:    seta %al
-; AVX512F-NEXT:    negb %al
 ; AVX512F-NEXT:    kmovd %eax, %k1
 ; AVX512F-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0

From 9bd2db0f2ef4d4814ee5f8923fa50c51f2fd3779 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 1 Apr 2025 07:56:26 +0100
Subject: [PATCH 2/2] Brackets and setBits
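
The fixup replaces insertBits(APInt::getAllOnes(EltSize), Offset) with
setBits(Offset, Offset + EltSize); both mark the same half-open bit range
[Offset, Offset + EltSize), setBits just avoids materializing a temporary
all-ones APInt. A small equivalence check (compiles against LLVM's ADT
headers; the concrete values are illustrative):

  #include "llvm/ADT/APInt.h"
  #include <cassert>
  using llvm::APInt;

  int main() {
    unsigned EltSize = 8, Offset = 16;
    APInt A = APInt::getZero(64);
    A.insertBits(APInt::getAllOnes(EltSize), Offset); // old form
    APInt B = APInt::getZero(64);
    B.setBits(Offset, Offset + EltSize);              // new form
    assert(A == B); // identical masks: bits [16, 24) set
  }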

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c249929d35d5e..0f38bbd46cbca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3167,12 +3167,13 @@ bool TargetLowering::SimplifyDemandedVectorElts(
       // TODO - bigendian once we have test coverage.
       if (IsLE) {
         APInt DemandedSrcBits = APInt::getZero(SrcVT.getSizeInBits());
-        for (unsigned i = 0; i != NumElts; ++i)
-          if (DemandedElts[i]) {
-            unsigned Offset = i * VT.getScalarSizeInBits();
-            DemandedSrcBits.insertBits(
-                APInt::getAllOnes(VT.getScalarSizeInBits()), Offset);
+        unsigned EltSize = VT.getScalarSizeInBits();
+        for (unsigned I = 0; I != NumElts; ++I) {
+          if (DemandedElts[I]) {
+            unsigned Offset = I * EltSize;
+            DemandedSrcBits.setBits(Offset, Offset + EltSize);
           }
+        }
         KnownBits Known;
         if (SimplifyDemandedBits(Src, DemandedSrcBits, Known, TLO, Depth + 1))
           return true;


