[clang] [llvm] [HLSL][DXIL][SPIRV] QuadReadLaneAt intrinsic support (PR #205735)

Fri Jun 26 05:42:22 PDT 2026

https://github.com/kcloudy0717 updated https://github.com/llvm/llvm-project/pull/205735

>From de9cdb6c854b65fbcf0f87c93b3ddfba1bc90ac2 Mon Sep 17 00:00:00 2001
From: Kai Huang <kcloudy0717 at gmail.com>
Date: Sun, 29 Mar 2026 00:25:42 +0800
Subject: [PATCH] [HLSL][DXIL][SPIRV] QuadReadLaneAt intrinsic support

---
 clang/include/clang/Basic/Builtins.td         |   6 +
 clang/include/clang/Basic/HLSLIntrinsics.td   |  16 ++
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          |   9 +
 clang/lib/CodeGen/CGHLSLRuntime.h             |   1 +
 clang/lib/Sema/SemaHLSL.cpp                   |   3 +-
 .../CodeGenHLSL/builtins/QuadReadLaneAt.hlsl  | 185 ++++++++++++++++++
 .../BuiltIns/QuadReadLaneAt-errors.hlsl       |  38 ++++
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   1 +
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       |   1 +
 llvm/lib/Target/DirectX/DXIL.td               |  10 +
 llvm/lib/Target/DirectX/DXILShaderFlags.cpp   |   1 +
 .../DirectX/DirectXTargetTransformInfo.cpp    |   1 +
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.td       |   1 +
 .../Target/SPIRV/SPIRVInstructionSelector.cpp |   3 +
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp |   1 +
 llvm/test/CodeGen/DirectX/QuadReadLaneAt.ll   |  95 +++++++++
 .../QuadReadLaneAt.constant.ll                |  62 ++++++
 .../hlsl-intrinsics/QuadReadLaneAt.uniform.ll |  65 ++++++
 18 files changed, 498 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/QuadReadLaneAt.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/QuadReadLaneAt-errors.hlsl
 create mode 100644 llvm/test/CodeGen/DirectX/QuadReadLaneAt.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.constant.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.uniform.ll

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 63cdb787bea16..b37725498fbdc 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5555,6 +5555,12 @@ def HLSLWavePrefixProduct : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLQuadReadLaneAt : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_quad_read_lane_at"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLQuadReadAcrossX : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_quad_read_across_x"];
   let Attributes = [NoThrow, Const];
diff --git a/clang/include/clang/Basic/HLSLIntrinsics.td b/clang/include/clang/Basic/HLSLIntrinsics.td
index 99259046940f1..5dd2cd47b9756 100644
--- a/clang/include/clang/Basic/HLSLIntrinsics.td
+++ b/clang/include/clang/Basic/HLSLIntrinsics.td
@@ -1322,6 +1322,22 @@ def hlsl_pow : HLSLTwoArgBuiltin<"pow", "__builtin_elementwise_pow"> {
   let VaryingMatDims = [];
 }
 
+// Returns the value from the lane with the specified index in the quad.
+def hlsl_quad_read_lane_at : HLSLBuiltin<"QuadReadLaneAt", "__builtin_hlsl_quad_read_lane_at"> {
+  let Doc = [{
+\brief Returns the value from the lane with the specified index in the quad.
+\param Val The value to read.
+\param Index The index of the lane within the quad to read from (0 to 3).
+}];
+  let ReturnType = Varying;
+  let Args = [Varying, UIntTy];
+  let VaryingTypes = AllTypesWithBool;
+  let VaryingScalar = 1;
+  let VaryingVecSizes = [2, 3, 4];
+  let VaryingMatDims = [];
+  let IsConvergent = 1;
+}
+
 // Reads the value from the lane across the X axis of the quad.
 def hlsl_quad_read_across_x :
     HLSLOneArgBuiltin<"QuadReadAcrossX",
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 20a2119e28ce1..ce3af3ffd0467 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -1557,6 +1557,15 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr},
                              "hlsl.wave.prefix.product");
   }
+  case Builtin::BI__builtin_hlsl_quad_read_lane_at: {
+    Value *OpExpr = EmitScalarExpr(E->getArg(0));
+    Value *OpIndex = EmitScalarExpr(E->getArg(1));
+    return EmitRuntimeCall(
+        Intrinsic::getOrInsertDeclaration(
+            &CGM.getModule(), CGM.getHLSLRuntime().getQuadReadLaneAtIntrinsic(),
+            {OpExpr->getType()}),
+        ArrayRef{OpExpr, OpIndex}, "hlsl.quad.read.lane.at");
+  }
   case Builtin::BI__builtin_hlsl_quad_read_across_x: {
     Value *OpExpr = EmitScalarExpr(E->getArg(0));
     Intrinsic::ID ID = CGM.getHLSLRuntime().getQuadReadAcrossXIntrinsic();
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index a126d4612a5f4..a75ffdba04a8d 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -160,6 +160,7 @@ class CGHLSLRuntime {
   GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane)
   GENERATE_HLSL_INTRINSIC_FUNCTION(WaveGetLaneCount, wave_get_lane_count)
   GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(QuadReadLaneAt, quad_read_lane_at)
   GENERATE_HLSL_INTRINSIC_FUNCTION(QuadReadAcrossX, quad_read_across_x)
   GENERATE_HLSL_INTRINSIC_FUNCTION(QuadReadAcrossY, quad_read_across_y)
   GENERATE_HLSL_INTRINSIC_FUNCTION(QuadReadAcrossDiagonal,
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 075dc97b0aef2..55c96e4f525a8 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -4613,7 +4613,8 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
 
     break;
   }
-  case Builtin::BI__builtin_hlsl_wave_read_lane_at: {
+  case Builtin::BI__builtin_hlsl_wave_read_lane_at:
+  case Builtin::BI__builtin_hlsl_quad_read_lane_at: {
     if (SemaRef.checkArgCount(TheCall, 2))
       return true;
 
diff --git a/clang/test/CodeGenHLSL/builtins/QuadReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/QuadReadLaneAt.hlsl
new file mode 100644
index 0000000000000..9242b5075dc10
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/QuadReadLaneAt.hlsl
@@ -0,0 +1,185 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-compute %s -fnative-half-type -fnative-int16-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=dx -DCC=""
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=dx -DCC=""
+
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=spv -DCC="spir_func "
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=spv -DCC="spir_func " 
+
+// CHECK: %[[RET:.*]] = call [[CC]]i1 @llvm.[[TARGET]].quad.read.lane.at.i1(i1 %[[VAR:.*]], i32 %[[#]])
+// CHECK: ret i1 %[[RET]]
+bool test_bool(bool expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<2 x i1> @llvm.[[TARGET]].quad.read.lane.at.v2i1(<2 x i1> %[[VAR:.*]], i32 %[[#]])
+// CHECK: ret <2 x i1> %[[RET]]
+bool2 test_bool2(bool2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<3 x i1> @llvm.[[TARGET]].quad.read.lane.at.v3i1(<3 x i1> %[[VAR:.*]], i32 %[[#]])
+// CHECK: ret <3 x i1> %[[RET]]
+bool3 test_bool3(bool3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<4 x i1> @llvm.[[TARGET]].quad.read.lane.at.v4i1(<4 x i1> %[[VAR:.*]], i32 %[[#]])
+// CHECK: ret <4 x i1> %[[RET]]
+bool4 test_bool4(bool4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[TARGET]].quad.read.lane.at.i32(i32 %[[#]], i32 %[[#]])
+// CHECK: ret i32 %[[RET]]
+int test_int(int expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[TARGET]].quad.read.lane.at.v2i32(<2 x i32> %[[#]], i32 %[[#]])
+// CHECK: ret <2 x i32> %[[RET]]
+int2 test_int2(int2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[TARGET]].quad.read.lane.at.v3i32(<3 x i32> %[[#]], i32 %[[#]])
+// CHECK: ret <3 x i32> %[[RET]]
+int3 test_int3(int3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[TARGET]].quad.read.lane.at.v4i32(<4 x i32> %[[#]], i32 %[[#]])
+// CHECK: ret <4 x i32> %[[RET]]
+int4 test_int4(int4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[TARGET]].quad.read.lane.at.i32(i32 %[[#]], i32 %[[#]])
+// CHECK: ret i32 %[[RET]]
+uint test_uint(uint expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[TARGET]].quad.read.lane.at.v2i32(<2 x i32> %[[#]], i32 %[[#]])
+// CHECK: ret <2 x i32> %[[RET]]
+uint2 test_uint2(uint2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[TARGET]].quad.read.lane.at.v3i32(<3 x i32> %[[#]], i32 %[[#]])
+// CHECK: ret <3 x i32> %[[RET]]
+uint3 test_uint3(uint3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[TARGET]].quad.read.lane.at.v4i32(<4 x i32> %[[#]], i32 %[[#]])
+// CHECK: ret <4 x i32> %[[RET]]
+uint4 test_uint4(uint4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[TARGET]].quad.read.lane.at.i64(i64 %[[#]], i32 %[[#]])
+// CHECK: ret i64 %[[RET]]
+int64_t test_int64_t(int64_t expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[TARGET]].quad.read.lane.at.v2i64(<2 x i64> %[[#]], i32 %[[#]])
+// CHECK: ret <2 x i64> %[[RET]]
+int64_t2 test_int64_t2(int64_t2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[TARGET]].quad.read.lane.at.v3i64(<3 x i64> %[[#]], i32 %[[#]])
+// CHECK: ret <3 x i64> %[[RET]]
+int64_t3 test_int64_t3(int64_t3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[TARGET]].quad.read.lane.at.v4i64(<4 x i64> %[[#]], i32 %[[#]])
+// CHECK: ret <4 x i64> %[[RET]]
+int64_t4 test_int64_t4(int64_t4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[TARGET]].quad.read.lane.at.i64(i64 %[[#]], i32 %[[#]])
+// CHECK: ret i64 %[[RET]]
+uint64_t test_uint64_t(uint64_t expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[TARGET]].quad.read.lane.at.v2i64(<2 x i64> %[[#]], i32 %[[#]])
+// CHECK: ret <2 x i64> %[[RET]]
+uint64_t2 test_uint64_t2(uint64_t2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[TARGET]].quad.read.lane.at.v3i64(<3 x i64> %[[#]], i32 %[[#]])
+// CHECK: ret <3 x i64> %[[RET]]
+uint64_t3 test_uint64_t3(uint64_t3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[TARGET]].quad.read.lane.at.v4i64(<4 x i64> %[[#]], i32 %[[#]])
+// CHECK: ret <4 x i64> %[[RET]]
+uint64_t4 test_uint64_t4(uint64_t4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[TARGET]].quad.read.lane.at.f32(float %[[#]], i32 %[[#]])
+// CHECK: ret float %[[RET]]
+float test_float(float expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[TARGET]].quad.read.lane.at.v2f32(<2 x float> %[[#]], i32 %[[#]])
+// CHECK: ret <2 x float> %[[RET]]
+float2 test_float2(float2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[TARGET]].quad.read.lane.at.v3f32(<3 x float> %[[#]], i32 %[[#]])
+// CHECK: ret <3 x float> %[[RET]]
+float3 test_float3(float3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[TARGET]].quad.read.lane.at.v4f32(<4 x float> %[[#]], i32 %[[#]])
+// CHECK: ret <4 x float> %[[RET]]
+float4 test_float4(float4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]double @llvm.[[TARGET]].quad.read.lane.at.f64(double %[[#]], i32 %[[#]])
+// CHECK: ret double %[[RET]]
+double test_double(double expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x double> @llvm.[[TARGET]].quad.read.lane.at.v2f64(<2 x double> %[[#]], i32 %[[#]])
+// CHECK: ret <2 x double> %[[RET]]
+double2 test_double2(double2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x double> @llvm.[[TARGET]].quad.read.lane.at.v3f64(<3 x double> %[[#]], i32 %[[#]])
+// CHECK: ret <3 x double> %[[RET]]
+double3 test_double3(double3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x double> @llvm.[[TARGET]].quad.read.lane.at.v4f64(<4 x double> %[[#]], i32 %[[#]])
+// CHECK: ret <4 x double> %[[RET]]
+double4 test_double4(double4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]half @llvm.[[TARGET]].quad.read.lane.at.f16(half %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret half %[[RET]]
+// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[TARGET]].quad.read.lane.at.f32(float %[[#]], i32 %[[#]])
+// CHECK-NO_HALF: ret float %[[RET]]
+half test_half(half expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x half> @llvm.[[TARGET]].quad.read.lane.at.v2f16(<2 x half> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <2 x half> %[[RET]]
+// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[TARGET]].quad.read.lane.at.v2f32(<2 x float> %[[#]], i32 %[[#]])
+// CHECK-NO_HALF: ret <2 x float> %[[RET]]
+half2 test_half2(half2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x half> @llvm.[[TARGET]].quad.read.lane.at.v3f16(<3 x half> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <3 x half> %[[RET]]
+// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[TARGET]].quad.read.lane.at.v3f32(<3 x float> %[[#]], i32 %[[#]])
+// CHECK-NO_HALF: ret <3 x float> %[[RET]]
+half3 test_half3(half3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x half> @llvm.[[TARGET]].quad.read.lane.at.v4f16(<4 x half> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <4 x half> %[[RET]]
+// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[TARGET]].quad.read.lane.at.v4f32(<4 x float> %[[#]], i32 %[[#]])
+// CHECK-NO_HALF: ret <4 x float> %[[RET]]
+half4 test_half4(half4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+#ifdef __HLSL_ENABLE_16_BIT
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[TARGET]].quad.read.lane.at.i16(i16 %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret i16 %[[RET]]
+int16_t test_int16_t(int16_t expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[TARGET]].quad.read.lane.at.v2i16(<2 x i16> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]]
+int16_t2 test_int16_t2(int16_t2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[TARGET]].quad.read.lane.at.v3i16(<3 x i16> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]]
+int16_t3 test_int16_t3(int16_t3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[TARGET]].quad.read.lane.at.v4i16(<4 x i16> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]]
+int16_t4 test_int16_t4(int16_t4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[TARGET]].quad.read.lane.at.i16(i16 %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret i16 %[[RET]]
+uint16_t test_uint16_t(uint16_t expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[TARGET]].quad.read.lane.at.v2i16(<2 x i16> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]]
+uint16_t2 test_uint16_t2(uint16_t2 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[TARGET]].quad.read.lane.at.v3i16(<3 x i16> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]]
+uint16_t3 test_uint16_t3(uint16_t3 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+
+// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[TARGET]].quad.read.lane.at.v4i16(<4 x i16> %[[#]], i32 %[[#]])
+// CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]]
+uint16_t4 test_uint16_t4(uint16_t4 expr, uint idx) { return QuadReadLaneAt(expr, idx); }
+#endif
diff --git a/clang/test/SemaHLSL/BuiltIns/QuadReadLaneAt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/QuadReadLaneAt-errors.hlsl
new file mode 100644
index 0000000000000..38d22b1f44772
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/QuadReadLaneAt-errors.hlsl
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -verify
+
+bool test_too_few_arg() {
+  return __builtin_hlsl_quad_read_lane_at();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
+}
+
+float2 test_too_few_arg_1(float2 p0) {
+  return __builtin_hlsl_quad_read_lane_at(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_quad_read_lane_at(p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+}
+
+float3 test_index_double_type_check(float3 p0, double idx) {
+  return __builtin_hlsl_quad_read_lane_at(p0, idx);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'unsigned int'}}
+}
+
+float3 test_index_int3_type_check(float3 p0, int3 idxs) {
+  return __builtin_hlsl_quad_read_lane_at(p0, idxs);
+  // expected-error@-1 {{passing 'int3' (aka 'vector<int, 3>') to parameter of incompatible type 'unsigned int'}}
+}
+
+struct S { float f; };
+
+float3 test_index_S_type_check(float3 p0, S idx) {
+  return __builtin_hlsl_quad_read_lane_at(p0, idx);
+  // expected-error@-1 {{passing 'S' to parameter of incompatible type 'unsigned int'}}
+}
+
+S test_expr_struct_type_check(S p0, int idx) {
+  return __builtin_hlsl_quad_read_lane_at(p0, idx);
+  // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index af360dfc78965..cbcce3bd6b6bc 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -277,6 +277,7 @@ def int_dx_wave_prefix_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType
 def int_dx_wave_prefix_usum : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
 def int_dx_wave_prefix_product : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
 def int_dx_wave_prefix_uproduct : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
+def int_dx_quad_read_lane_at : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
 def int_dx_quad_read_across_x : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
 def int_dx_quad_read_across_y : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
 def int_dx_quad_read_across_diagonal : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6e4cf8f7e72dc..8c76ef3f9e6c7 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -166,6 +166,7 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
       : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent]>;
   def int_spv_wave_prefix_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
   def int_spv_wave_prefix_product : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
+  def int_spv_quad_read_lane_at : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
   def int_spv_quad_read_across_x : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
   def int_spv_quad_read_across_y : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
   def int_spv_quad_read_across_diagonal : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 299d2d113b6bf..7ab9f99a911bb 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -1310,6 +1310,16 @@ def WavePrefixOp : DXILOp<121, wavePrefixOp> {
   let attributes = [Attributes<DXIL1_0, []>];
 }
 
+def QuadReadLaneAt : DXILOp<122, quadReadLaneAt> {
+  let Doc = "returns the value from the specified lane in the quad";
+  let intrinsics = [IntrinSelect<int_dx_quad_read_lane_at>];
+  let arguments = [OverloadTy, Int32Ty];
+  let result = OverloadTy;
+  let overloads = [Overloads<
+      DXIL1_0, [HalfTy, FloatTy, DoubleTy, Int1Ty, Int16Ty, Int32Ty, Int64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
 def QuadOp : DXILOp<123, quadOp> {
   let Doc = "returns the value from another lane within the quad by swapping values in a direction";
   let intrinsics = [
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index 3a9ff9f62361a..da699667efcd1 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -107,6 +107,7 @@ static bool checkWaveOps(Intrinsic::ID IID) {
   case Intrinsic::dx_wave_prefix_product:
   case Intrinsic::dx_wave_prefix_uproduct:
     // Quad Op Variants
+  case Intrinsic::dx_quad_read_lane_at:
   case Intrinsic::dx_quad_read_across_x:
   case Intrinsic::dx_quad_read_across_y:
   case Intrinsic::dx_quad_read_across_diagonal:
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index af1d7bc452126..292007796b682 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -19,6 +19,7 @@ bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(
     Intrinsic::ID ID, unsigned ScalarOpdIdx) const {
   switch (ID) {
   case Intrinsic::dx_wave_readlane:
+  case Intrinsic::dx_quad_read_lane_at:
     return ScalarOpdIdx == 1;
   default:
     return false;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 66e5d2f6a626e..64379cf98987a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -855,6 +855,7 @@ def OpGroupNonUniformBitwiseXor: OpGroupNUGroup<"BitwiseXor", 361>;
 def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>;
 def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>;
 def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>;
+def OpGroupNonUniformQuadBroadcast : OpGroupNU4<"QuadBroadcast", 365>;
 def OpGroupNonUniformQuadSwap: OpGroupNU4<"QuadSwap", 366>;
 
 // SPV_KHR_subgroup_rotate
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index cd99015a61ba9..70cc025186aff 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -5382,6 +5382,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectWaveExclusiveScanSum(ResVReg, ResType, I);
   case Intrinsic::spv_wave_prefix_product:
     return selectWaveExclusiveScanProduct(ResVReg, ResType, I);
+  case Intrinsic::spv_quad_read_lane_at:
+    return selectWaveOpInst(ResVReg, ResType, I,
+                            SPIRV::OpGroupNonUniformQuadBroadcast);
   case Intrinsic::spv_quad_read_across_x: {
     return selectQuadSwap(ResVReg, ResType, I, /*Direction*/ 0);
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index aed16fd785af8..697b47d07866d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1832,6 +1832,7 @@ void addInstrRequirements(const MachineInstr &MI,
     }
     break;
   }
+  case SPIRV::OpGroupNonUniformQuadBroadcast:
   case SPIRV::OpGroupNonUniformQuadSwap:
     Reqs.addCapability(SPIRV::Capability::GroupNonUniformQuad);
     break;
diff --git a/llvm/test/CodeGen/DirectX/QuadReadLaneAt.ll b/llvm/test/CodeGen/DirectX/QuadReadLaneAt.ll
new file mode 100644
index 0000000000000..1ee2703c94ae0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/QuadReadLaneAt.ll
@@ -0,0 +1,95 @@
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s
+
+; Test that for scalar values, QuadReadLaneAt maps down to the DirectX op
+
+define noundef i1 @quad_read_lane_at_bool(i1 noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call i1 @dx.op.quadReadLaneAt.i1(i32 122, i1 %expr, i32 %idx)
+  %ret = call i1 @llvm.dx.quad.read.lane.at.i1(i1 %expr, i32 %idx)
+  ret i1 %ret
+}
+
+define noundef half @quad_read_lane_at_half(half noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call half @dx.op.quadReadLaneAt.f16(i32 122, half %expr, i32 %idx)
+  %ret = call half @llvm.dx.quad.read.lane.at.f16(half %expr, i32 %idx)
+  ret half %ret
+}
+
+define noundef float @quad_read_lane_at_float(float noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call float @dx.op.quadReadLaneAt.f32(i32 122, float %expr, i32 %idx)
+  %ret = call float @llvm.dx.quad.read.lane.at.f32(float %expr, i32 %idx)
+  ret float %ret
+}
+
+define noundef double @quad_read_lane_at_double(double noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call double @dx.op.quadReadLaneAt.f64(i32 122, double %expr, i32 %idx)
+  %ret = call double @llvm.dx.quad.read.lane.at.f64(double %expr, i32 %idx)
+  ret double %ret
+}
+
+define noundef i16 @quad_read_lane_at_i16(i16 noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call i16 @dx.op.quadReadLaneAt.i16(i32 122, i16 %expr, i32 %idx)
+  %ret = call i16 @llvm.dx.quad.read.lane.at.i16(i16 %expr, i32 %idx)
+  ret i16 %ret
+}
+
+define noundef i32 @quad_read_lane_at_i32(i32 noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call i32 @dx.op.quadReadLaneAt.i32(i32 122, i32 %expr, i32 %idx)
+  %ret = call i32 @llvm.dx.quad.read.lane.at.i32(i32 %expr, i32 %idx)
+  ret i32 %ret
+}
+
+define noundef i64 @quad_read_lane_at_i64(i64 noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call i64 @dx.op.quadReadLaneAt.i64(i32 122, i64 %expr, i32 %idx)
+  %ret = call i64 @llvm.dx.quad.read.lane.at.i64(i64 %expr, i32 %idx)
+  ret i64 %ret
+}
+
+declare i1 @llvm.dx.quad.read.lane.at.i1(i1, i32)
+declare half @llvm.dx.quad.read.lane.at.f16(half, i32)
+declare float @llvm.dx.quad.read.lane.at.f32(float, i32)
+declare double @llvm.dx.quad.read.lane.at.f64(double, i32)
+
+declare i16 @llvm.dx.quad.read.lane.at.i16(i16, i32)
+declare i32 @llvm.dx.quad.read.lane.at.i32(i32, i32)
+declare i64 @llvm.dx.quad.read.lane.at.i64(i64, i32)
+
+; Test that for vector values, QuadReadLaneAt scalarizes and maps down to the
+; DirectX op
+
+define noundef <2 x half> @quad_read_lane_at_v2half(<2 x half> noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call half @dx.op.quadReadLaneAt.f16(i32 122, half %expr.i0, i32 %idx)
+; CHECK: call half @dx.op.quadReadLaneAt.f16(i32 122, half %expr.i1, i32 %idx)
+  %ret = call <2 x half> @llvm.dx.quad.read.lane.at.v2f16(<2 x half> %expr, i32 %idx)
+  ret <2 x half> %ret
+}
+
+define noundef <3 x i32> @quad_read_lane_at_v3i32(<3 x i32> noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call i32 @dx.op.quadReadLaneAt.i32(i32 122, i32 %expr.i0, i32 %idx)
+; CHECK: call i32 @dx.op.quadReadLaneAt.i32(i32 122, i32 %expr.i1, i32 %idx)
+; CHECK: call i32 @dx.op.quadReadLaneAt.i32(i32 122, i32 %expr.i2, i32 %idx)
+  %ret = call <3 x i32> @llvm.dx.quad.read.lane.at.v3i32(<3 x i32> %expr, i32 %idx)
+  ret <3 x i32> %ret
+}
+
+define noundef <4 x double> @quad_read_lane_at_v4f64(<4 x double> noundef %expr, i32 noundef %idx) {
+entry:
+; CHECK: call double @dx.op.quadReadLaneAt.f64(i32 122, double %expr.i0, i32 %idx)
+; CHECK: call double @dx.op.quadReadLaneAt.f64(i32 122, double %expr.i1, i32 %idx)
+; CHECK: call double @dx.op.quadReadLaneAt.f64(i32 122, double %expr.i2, i32 %idx)
+; CHECK: call double @dx.op.quadReadLaneAt.f64(i32 122, double %expr.i3, i32 %idx)
+  %ret = call <4 x double> @llvm.dx.quad.read.lane.at.v4f64(<4 x double> %expr, i32 %idx)
+  ret <4 x double> %ret
+}
+
+declare <2 x half> @llvm.dx.quad.read.lane.at.v2f16(<2 x half>, i32)
+declare <3 x i32> @llvm.dx.quad.read.lane.at.v3i32(<3 x i32>, i32)
+declare <4 x double> @llvm.dx.quad.read.lane.at.v4f64(<4 x double>, i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.constant.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.constant.ll
new file mode 100644
index 0000000000000..0e4258ba65a6d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.constant.ll
@@ -0,0 +1,62 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan1.3 %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan1.3 %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+; Test lowering to the SPIR-V backend for various scalar and vector types.
+; This covers SPIR-V versions before 1.5, where the index must be a constant.
+
+; CHECK: OpCapability GroupNonUniformQuad
+
+; CHECK-DAG:   %[[#bool:]] = OpTypeBool
+; CHECK-DAG:   %[[#f16:]] = OpTypeFloat 16
+; CHECK-DAG:   %[[#f32:]] = OpTypeFloat 32
+; CHECK-DAG:   %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG:   %[[#v4_half:]] = OpTypeVector %[[#f16]] 4
+; CHECK-DAG:   %[[#scope:]] = OpConstant %[[#uint]] 3
+; CHECK-DAG:   %[[#uint_0:]] = OpConstant %[[#uint]] 0
+
+; CHECK-LABEL: Begin function test_bool
+; CHECK:   %[[#bexpr:]] = OpFunctionParameter %[[#bool]]
+define internal i1 @test_bool(i1 %bexpr) {
+entry:
+; CHECK:   %[[#bret:]] = OpGroupNonUniformQuadBroadcast %[[#bool]] %[[#scope]] %[[#bexpr]] %[[#uint_0]]
+  %bcast = call i1 @llvm.spv.quad.read.lane.at.i1(i1 %bexpr, i32 0) ; bool overload, literal lane index 0
+  ret i1 %bcast
+}
+
+; CHECK-LABEL: Begin function test_float
+; CHECK:   %[[#fexpr:]] = OpFunctionParameter %[[#f32]]
+define internal float @test_float(float %fexpr) {
+entry:
+; CHECK:   %[[#fret:]] = OpGroupNonUniformQuadBroadcast %[[#f32]] %[[#scope]] %[[#fexpr]] %[[#uint_0]]
+  %bcast = call float @llvm.spv.quad.read.lane.at.f32(float %fexpr, i32 0) ; float overload, literal lane index 0
+  ret float %bcast
+}
+
+; CHECK-LABEL: Begin function test_int
+; CHECK:   %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define internal i32 @test_int(i32 %iexpr) {
+entry:
+; CHECK:   %[[#iret:]] = OpGroupNonUniformQuadBroadcast %[[#uint]] %[[#scope]] %[[#iexpr]] %[[#uint_0]]
+  %bcast = call i32 @llvm.spv.quad.read.lane.at.i32(i32 %iexpr, i32 0) ; i32 overload, literal lane index 0
+  ret i32 %bcast
+}
+
+; CHECK-LABEL: Begin function test_vhalf
+; CHECK:   %[[#vhexpr:]] = OpFunctionParameter %[[#v4_half]]
+define internal <4 x half> @test_vhalf(<4 x half> %vhexpr) {
+entry:
+; CHECK:   %[[#vhalfret:]] = OpGroupNonUniformQuadBroadcast %[[#v4_half]] %[[#scope]] %[[#vhexpr]] %[[#uint_0]]
+  %0 = call <4 x half> @llvm.spv.quad.read.lane.at.v4f16(<4 x half> %vhexpr, i32 0) ; <4 x half> overload, literal lane index 0
+  ret <4 x half> %0
+}
+
+define void @main() #0 {
+  ret void
+}
+
+declare i1 @llvm.spv.quad.read.lane.at.i1(i1, i32)
+declare float @llvm.spv.quad.read.lane.at.f32(float, i32)
+declare i32 @llvm.spv.quad.read.lane.at.i32(i32, i32)
+declare <4 x half> @llvm.spv.quad.read.lane.at.v4f16(<4 x half>, i32) ; v4f16 is the canonical overload suffix for <4 x half>
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.uniform.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.uniform.ll
new file mode 100644
index 0000000000000..f3818a6828f86
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/QuadReadLaneAt.uniform.ll
@@ -0,0 +1,65 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.5-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Test lowering to the SPIR-V backend for various scalar and vector types.
+; This covers SPIR-V 1.5, where the index must be dynamically uniform.
+
+; CHECK: OpCapability GroupNonUniformQuad
+
+; CHECK-DAG:   %[[#bool:]] = OpTypeBool
+; CHECK-DAG:   %[[#f16:]] = OpTypeFloat 16
+; CHECK-DAG:   %[[#f32:]] = OpTypeFloat 32
+; CHECK-DAG:   %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG:   %[[#v4_half:]] = OpTypeVector %[[#f16]] 4
+; CHECK-DAG:   %[[#scope:]] = OpConstant %[[#uint]] 3
+
+; CHECK-LABEL: Begin function test_bool
+; CHECK:   %[[#bexpr:]] = OpFunctionParameter %[[#bool]]
+; CHECK:   %[[#idx:]] = OpFunctionParameter %[[#uint]]
+define internal i1 @test_bool(i1 %bexpr, i32 %idx) {
+entry:
+; CHECK:   %[[#bret:]] = OpGroupNonUniformQuadBroadcast %[[#bool]] %[[#scope]] %[[#bexpr]] %[[#idx]]
+  %bcast = call i1 @llvm.spv.quad.read.lane.at.i1(i1 %bexpr, i32 %idx) ; bool overload, lane index taken from the %idx parameter
+  ret i1 %bcast
+}
+
+; CHECK-LABEL: Begin function test_float
+; CHECK:   %[[#fexpr:]] = OpFunctionParameter %[[#f32]]
+; CHECK:   %[[#idx:]] = OpFunctionParameter %[[#uint]]
+define internal float @test_float(float %fexpr, i32 %idx) {
+entry:
+; CHECK:   %[[#fret:]] = OpGroupNonUniformQuadBroadcast %[[#f32]] %[[#scope]] %[[#fexpr]] %[[#idx]]
+  %bcast = call float @llvm.spv.quad.read.lane.at.f32(float %fexpr, i32 %idx) ; float overload, lane index taken from the %idx parameter
+  ret float %bcast
+}
+
+; CHECK-LABEL: Begin function test_int
+; CHECK:   %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+; CHECK:   %[[#idx:]] = OpFunctionParameter %[[#uint]]
+define internal i32 @test_int(i32 %iexpr, i32 %idx) {
+entry:
+; CHECK:   %[[#iret:]] = OpGroupNonUniformQuadBroadcast %[[#uint]] %[[#scope]] %[[#iexpr]] %[[#idx]]
+  %bcast = call i32 @llvm.spv.quad.read.lane.at.i32(i32 %iexpr, i32 %idx) ; i32 overload, lane index taken from the %idx parameter
+  ret i32 %bcast
+}
+
+; CHECK-LABEL: Begin function test_vhalf
+; CHECK:   %[[#vhexpr:]] = OpFunctionParameter %[[#v4_half]]
+; CHECK:   %[[#idx:]] = OpFunctionParameter %[[#uint]]
+define internal <4 x half> @test_vhalf(<4 x half> %vhexpr, i32 %idx) {
+entry:
+; CHECK:   %[[#vhalfret:]] = OpGroupNonUniformQuadBroadcast %[[#v4_half]] %[[#scope]] %[[#vhexpr]] %[[#idx]]
+  %0 = call <4 x half> @llvm.spv.quad.read.lane.at.v4f16(<4 x half> %vhexpr, i32 %idx) ; <4 x half> overload, lane index taken from the %idx parameter
+  ret <4 x half> %0
+}
+
+define void @main() #0 {
+  ret void
+}
+
+declare i1 @llvm.spv.quad.read.lane.at.i1(i1, i32)
+declare float @llvm.spv.quad.read.lane.at.f32(float, i32)
+declare i32 @llvm.spv.quad.read.lane.at.i32(i32, i32)
+declare <4 x half> @llvm.spv.quad.read.lane.at.v4f16(<4 x half>, i32) ; v4f16 is the canonical overload suffix for <4 x half>
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }