Coverage for src/flag_gems/ops/reflection_pad2d.py

1import logging

2import math

4import torch

5import triton

6import triton.language as tl

8import flag_gems

10logger = logging.getLogger(__name__)

@triton.jit
def reflection_pad2d_kernel(
    in_ptr,
    out_ptr,
    B,
    H_in,
    W_in,
    pad_left,
    pad_top,
    H_out,
    W_out,
    BLOCK_HW: tl.constexpr,
):
    """Fill one block of one output plane with reflection-padded input values.

    Program axis 0 selects the plane (0 .. B-1); program axis 1 selects a
    block of ``BLOCK_HW`` consecutive flattened output positions inside
    that plane.

    Args:
        in_ptr: Input, addressed as B planes of ``H_in * W_in`` elements
            stored back to back.
        out_ptr: Output, addressed as B planes of ``H_out * W_out``
            elements stored back to back.
        B: Number of planes.
        H_in, W_in: Input plane height and width. The reflection period is
            ``2 * (dim - 1)`` and is used as a modulus, so the caller must
            guarantee both are at least 2.
        pad_left, pad_top: Padding on the left and top edges; they shift
            output coordinates into input coordinates.
        H_out, W_out: Output plane height and width.
        BLOCK_HW: Compile-time number of output elements per program.
    """
    pid_b = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Flattened output positions handled by this program, decoded to (h, w).
    offs_n = pid_n * BLOCK_HW + tl.arange(0, BLOCK_HW)
    h_idx = offs_n // W_out
    w_idx = offs_n % W_out

    mask = (offs_n < H_out * W_out) & (pid_b < B)

    # Promote the plane index to 64-bit before scaling it: the per-plane
    # base offset can exceed the int32 range even when every individual
    # plane is small.
    plane = pid_b.to(tl.int64)
    base_in = plane * H_in * W_in
    base_out = plane * H_out * W_out

    # Reflect the row index: shift by the top padding, fold negatives with
    # abs(), wrap with period 2 * (H_in - 1), then mirror the upper half
    # of the period back into [0, H_in).
    y = h_idx.to(tl.int32) - pad_top
    Hm1 = H_in - 1
    pH = 2 * Hm1
    t_h = tl.abs(y)
    m_h = t_h % pH
    ih = tl.where(m_h < H_in, m_h, pH - m_h)

    # Same reflection for the column index.
    x = w_idx.to(tl.int32) - pad_left
    Wm1 = W_in - 1
    pW = 2 * Wm1
    t_w = tl.abs(x)
    m_w = t_w % pW
    iw = tl.where(m_w < W_in, m_w, pW - m_w)

    # Gather from the reflected input position, write to the output position.
    in_offs = ih * W_in + iw
    vals = tl.load(in_ptr + base_in + in_offs, mask=mask, other=0)
    tl.store(out_ptr + base_out + offs_n, vals, mask=mask)

@triton.jit
def copy_tensor_kernel(in_ptr, out_ptr, B, H, W, BLOCK_HW: tl.constexpr):
    """Copy one block of one ``H * W`` plane from ``in_ptr`` to ``out_ptr``.

    Program axis 0 selects the plane (0 .. B-1); program axis 1 selects a
    block of ``BLOCK_HW`` consecutive flattened positions inside that
    plane. Both buffers are addressed as B planes of ``H * W`` elements
    stored back to back.
    """
    pid_b = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    offs_n = pid_n * BLOCK_HW + tl.arange(0, BLOCK_HW)
    mask = (offs_n < H * W) & (pid_b < B)

    # Promote the plane index to 64-bit before scaling it so the base
    # offset cannot wrap around the int32 range on very large tensors.
    base = pid_b.to(tl.int64) * H * W
    vals = tl.load(in_ptr + base + offs_n, mask=mask, other=0)
    tl.store(out_ptr + base + offs_n, vals, mask=mask)

def launch_reflection_pad2d(input: torch.Tensor, padding, out: torch.Tensor = None):
    """Validate the arguments and launch the reflection-padding kernels.

    Args:
        input: Tensor with at least 3 dimensions on the ``flag_gems.device``
            device type. The last two dimensions are treated as (H, W);
            all leading dimensions are flattened into a batch of planes.
        padding: List or tuple ``(pad_left, pad_right, pad_top, pad_bottom)``
            of non-negative values, each strictly smaller than the
            corresponding input dimension.
        out: Optional destination tensor. It must have the padded shape and
            the same dtype and device as ``input``. It does not have to be
            contiguous.

    Returns:
        ``out`` when it was supplied, otherwise a newly allocated tensor of
        shape ``(*input.shape[:-2], H_out, W_out)``.

    Raises:
        ValueError: If ``padding``, ``input`` or ``out`` fails validation.
    """
    # Validate padding format
    if not isinstance(padding, (list, tuple)):
        raise ValueError("padding must be a sequence")
    if len(padding) != 4:
        raise ValueError(
            "padding must be a sequence of length 4: (pad_left, pad_right, pad_top, pad_bottom)"
        )
    pad_left, pad_right, pad_top, pad_bottom = (int(p) for p in padding)

    # Validate padding values
    if min(pad_left, pad_right, pad_top, pad_bottom) < 0:
        raise ValueError("padding values must be >= 0")

    # Validate input
    if input.dim() < 3:
        raise ValueError("input must have at least 3 dimensions")
    if input.device.type != flag_gems.device:
        raise ValueError(f"input must be a {flag_gems.device} tensor")

    x = input.contiguous()
    H_in = int(x.shape[-2])
    W_in = int(x.shape[-1])
    has_padding = any((pad_left, pad_right, pad_top, pad_bottom))

    # Validate reflection padding constraints. The emptiness check comes
    # first so that it is reachable, and the "at least 2" rule is applied
    # only when there is something to reflect, as its message states.
    if H_in <= 0 or W_in <= 0:
        raise ValueError("spatial dimensions must be > 0")
    if has_padding and (H_in < 2 or W_in < 2):
        raise ValueError(
            "input spatial dimensions must be at least 2 for reflection padding when padding > 0"
        )
    if pad_left >= W_in or pad_right >= W_in or pad_top >= H_in or pad_bottom >= H_in:
        raise ValueError(
            "padding values must be less than the input spatial dimensions for reflection padding"
        )

    H_out = H_in + pad_top + pad_bottom
    W_out = W_in + pad_left + pad_right

    leading_shape = x.shape[:-2]
    expected_shape = (*leading_shape, H_out, W_out)
    B = int(math.prod(leading_shape))

    # Handle output tensor
    if out is None:
        out = torch.empty(expected_shape, device=x.device, dtype=x.dtype)
    else:
        if out.device.type != flag_gems.device:
            raise ValueError(f"out must be a {flag_gems.device} tensor")
        if tuple(out.shape) != expected_shape:
            raise ValueError(
                f"out tensor has shape {tuple(out.shape)}, expected {expected_shape}"
            )
        if out.dtype != x.dtype:
            raise ValueError(
                f"out dtype {out.dtype} does not match input dtype {x.dtype}"
            )
        if out.device != x.device:
            raise ValueError("out must be on the same device as input")

    # No planes to process: skip launching an empty grid.
    if B == 0:
        return out

    # The kernels address the destination as densely packed planes. For a
    # non-contiguous `out`, write into a contiguous staging buffer and copy
    # it back afterwards so the caller's tensor really receives the result.
    if out.is_contiguous():
        dst = out
    else:
        dst = torch.empty(expected_shape, device=x.device, dtype=x.dtype)

    BLOCK_HW = 256
    if has_padding:
        grid = (B, triton.cdiv(H_out * W_out, BLOCK_HW))
        reflection_pad2d_kernel[grid](
            x, dst, B, H_in, W_in, pad_left, pad_top, H_out, W_out, BLOCK_HW=BLOCK_HW
        )
    else:
        # No padding: just copy
        grid = (B, triton.cdiv(H_in * W_in, BLOCK_HW))
        copy_tensor_kernel[grid](x, dst, B, H_in, W_in, BLOCK_HW=BLOCK_HW)

    if dst is not out:
        out.copy_(dst)
    return out

150

151

def reflection_pad2d(input: torch.Tensor, padding):
    """Log the call and delegate to ``launch_reflection_pad2d`` without an ``out`` tensor.

    ``padding`` is forwarded unchanged; the launcher allocates the result
    itself when no ``out`` tensor is given.
    """
    logger.debug("GEMS REFLECTION_PAD2D")
    padded = launch_reflection_pad2d(input, padding)
    return padded

155

156

def reflection_pad2d_out(input: torch.Tensor, padding, out: torch.Tensor):
    """Log the call and delegate to ``launch_reflection_pad2d`` with the caller's ``out``.

    ``input``, ``padding`` and ``out`` are forwarded unchanged, and the
    launcher's return value is handed back to the caller.
    """
    logger.debug("GEMS REFLECTION_PAD2D_OUT")
    result = launch_reflection_pad2d(input, padding, out)
    return result

Coverage for src/flag_gems/ops/reflection_pad2d.py: 55%

93 statements