Coverage for src/flag_gems/runtime/backend/_tsingmicro/ops/matmul

3# Permission is hereby granted, free of charge, to any person obtaining a copy

4# of this software and associated documentation files (the "Software"), to deal

5# in the Software without restriction, including without limitation the rights

6# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

7# copies of the Software, and to permit persons to whom the Software is

8# furnished to do so, subject to the following conditions:

10# The above copyright notice and this permission notice shall be included in

11# all copies or substantial portions of the Software.

12#

13# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

14# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

15# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

16# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

17# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

18# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN

19# THE SOFTWARE.

21"""

22Matrix Multiplication

=====================

24"""

26import torch

27import triton

28import triton.language as tl

30DEV = "txda"

def get_output_dtype(a_dtype, b_dtype):
    """Return the dtype used for the matmul output tensor.

    Args:
        a_dtype: dtype of the left operand (accepted, not consulted).
        b_dtype: dtype of the right operand (accepted, not consulted).

    Returns:
        ``torch.bfloat16``, regardless of the operand dtypes.
    """
    result_dtype = torch.bfloat16
    return result_dtype

def get_autotune_config():
    """Build the list of tile configurations offered to the autotuner.

    Returns:
        A list of ``triton.Config`` objects, one per tile edge length
        (64, 128, 256), each using the same value for ``BLOCK_SIZE_M``,
        ``BLOCK_SIZE_N`` and ``BLOCK_SIZE_K``.
    """
    tile_edges = (64, 128, 256)
    return [
        triton.Config(
            {"BLOCK_SIZE_M": edge, "BLOCK_SIZE_N": edge, "BLOCK_SIZE_K": edge}
        )
        for edge in tile_edges
    ]

# Autotune over the tile sizes from get_autotune_config(); the tuning result
# is keyed on the problem dimensions (M, N, K).
@triton.autotune(
    configs=get_autotune_config(),
    key=["M", "N", "K"],
)
@triton.jit
def matmul_kernel(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    # Matrix dimensions
    M,
    N,
    K,
    # The stride variables represent how much to increase the ptr by when moving by 1
    # element in a particular dimension.
    stride_am,
    stride_ak,  #
    stride_bk,
    stride_bn,  #
    stride_cm,
    stride_cn,
    # Meta-parameters (compile-time tile sizes supplied by the autotuner)
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,  #
):
    """Kernel for computing the matmul C = A x B.

    A has shape (M, K), B has shape (K, N) and C has shape (M, N).

    Each program instance (1-D launch grid, axis 0) computes one
    ``[BLOCK_SIZE_M, BLOCK_SIZE_N]`` tile of C. Products are accumulated in
    float32 and cast to the element type of ``c_ptr`` before being stored.
    Loads and the final store are boundary-checked, and out-of-range loads are
    zero-padded, so M, N and K need not be multiples of the tile sizes.
    """
    # Number of consecutive M-tiles that form one group. Within a group,
    # consecutive program ids walk down the M-tiles before moving to the next
    # N-tile; the stated intent is better L2 reuse of the B columns.
    GROUP_SIZE_M: tl.constexpr = 8
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    # The last group may contain fewer than GROUP_SIZE_M rows of tiles.
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # ----------------------------------------------------------
    # Create block pointers for A and B using make_block_ptr.
    # A starts at this program's row tile and column 0; B starts at row 0 and
    # this program's column tile. Both are advanced along K in the loop below.
    a_block_ptr = tl.make_block_ptr(
        base=a_ptr,
        shape=(M, K),
        strides=(stride_am, stride_ak),
        offsets=(pid_m * BLOCK_SIZE_M, 0),
        block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_K),
        order=(1, 0),
    )
    b_block_ptr = tl.make_block_ptr(
        base=b_ptr,
        shape=(K, N),
        strides=(stride_bk, stride_bn),
        offsets=(0, pid_n * BLOCK_SIZE_N),
        block_shape=(BLOCK_SIZE_K, BLOCK_SIZE_N),
        order=(1, 0),
    )
    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Out-of-bounds elements in either dimension are read as zero, so
        # partial tiles at the matrix edges contribute nothing to the dot.
        a = tl.load(a_block_ptr, boundary_check=(0, 1), padding_option="zero")
        b = tl.load(b_block_ptr, boundary_check=(0, 1), padding_option="zero")
        accumulator += tl.dot(a, b, out_dtype=tl.float32, allow_tf32=False)
        # Step both operands one tile further along the K dimension.
        a_block_ptr = tl.advance(a_block_ptr, (0, BLOCK_SIZE_K))
        b_block_ptr = tl.advance(b_block_ptr, (BLOCK_SIZE_K, 0))
    # Cast the fp32 accumulator to the output tensor's element type.
    c = accumulator.to(c_ptr.dtype.element_ty)
    # -----------------------------------------------------------
    # Write back the block of the output matrix C.
    c_block_ptr = tl.make_block_ptr(
        base=c_ptr,
        shape=(M, N),
        strides=(stride_cm, stride_cn),
        offsets=(pid_m * BLOCK_SIZE_M, pid_n * BLOCK_SIZE_N),
        block_shape=(BLOCK_SIZE_M, BLOCK_SIZE_N),
        order=(1, 0),
    )
    # The boundary check keeps edge tiles from writing outside the (M, N) output.
    tl.store(c_block_ptr, c, boundary_check=(0, 1))

134

135

def torch_matmul(a, b):
    """Compute ``a @ b.T`` in bfloat16 with ``torch.matmul``.

    Both operands are cast to ``torch.bfloat16`` before the multiplication.

    Args:
        a: Left operand; its last dimension must match ``b``'s last dimension.
        b: Right operand of shape (N, K); it is transposed to (K, N) before
            the product.

    Returns:
        The bfloat16 product of ``a`` and ``b`` transposed.
    """
    # Function-scope import keeps this block self-contained; the operand
    # dtypes are reported as a debug log record rather than printed, so
    # calling this function no longer writes to stdout.
    import logging

    logging.getLogger(__name__).debug("a.dtype=%s b.dtype=%s", a.dtype, b.dtype)
    # b is (N, K), so b.t() gives (K, N)
    lhs = a.to(torch.bfloat16)
    rhs = b.to(torch.bfloat16).t()
    return torch.matmul(lhs, rhs)

141

142

143# %%

144# We can now create a convenience wrapper function that only takes two input tensors,

145# and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel.

146

147

def matmul_int8(a, b):
    """Multiply ``a`` by the transpose of ``b`` using the Triton kernel.

    int8 operands are converted to bfloat16 before the multiplication; the
    output dtype is whatever ``get_output_dtype`` returns.

    Args:
        a: Tensor of shape (..., K). Any leading dimensions are flattened
            into a single M dimension for the kernel and restored on the
            result.
        b: 2-D tensor of shape (N, K).

    Returns:
        Tensor of shape (..., N) on ``a``'s device.

    Raises:
        AssertionError: If the K dimensions of ``a`` and ``b`` differ.
    """
    # Remember the caller's shape so the leading dimensions can be restored.
    a_shape = a.shape
    if a.ndim > 2:
        # Collapse every leading dimension into M (not only the 3-D case).
        a = a.contiguous().reshape(-1, a_shape[-1])
    # Handle non-contiguous inputs if necessary
    if a.stride(0) > 1 and a.stride(1) > 1:
        a = a.contiguous()
    # b has shape (N, K), transpose to (K, N) contiguous for the kernel
    b = b.t().contiguous()
    # Check constraints. After transpose, b has shape (K, N).
    # Raised explicitly (same exception type and message as before) so the
    # check is not stripped when Python runs with -O; a mismatch would
    # otherwise let the kernel index b with the wrong K.
    if a.shape[1] != b.shape[0]:
        raise AssertionError("Incompatible dimensions")
    M, K = a.shape
    N = b.shape[1]
    # Convert int8 to bfloat16 for matrix multiplication. Both operands are
    # converted when either one is int8 so the kernel never sees mixed dtypes.
    if a.dtype == torch.int8 or b.dtype == torch.int8:
        a = a.to(torch.bfloat16)
        b = b.to(torch.bfloat16)
    # Allocates output.
    c_dtype = get_output_dtype(a.dtype, b.dtype)
    c = torch.empty((M, N), device=a.device, dtype=c_dtype)

    # 1D launch kernel where each block gets its own program.
    def grid(META):
        return (
            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
        )

    # An empty output has nothing to compute; skip the launch instead of
    # requesting a zero-sized grid.
    if M > 0 and N > 0:
        matmul_kernel[grid](
            a,
            b,
            c,
            M,
            N,
            K,
            a.stride(0),
            a.stride(1),
            b.stride(0),
            b.stride(1),
            c.stride(0),
            c.stride(1),
        )
    # Reshape output back if the input had leading batch dimensions
    if len(a_shape) > 2:
        c = c.reshape(*a_shape[:-1], N)
    return c

Coverage for src/flag_gems/runtime/backend/_tsingmicro/ops/matmul_int8.py: 0%

57 statements