tensorflow · sgrigory · Oct 30, 2021
diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
@@ -3608,8 +3608,7 @@ def local_attention_2d(q,
  Args:
  q: a Tensor with shape [batch, heads, h, w, depth_k]
  k: a Tensor with shape [batch, heads, h, w, depth_k]
- v: a Tensor with shape [batch, heads, h, w, depth_v]. In the current
- implementation, depth_v must be equal to depth_k.
+ v: a Tensor with shape [batch, heads, h, w, depth_v]
  query_shape: an tuple indicating the height and width of each query block.
  memory_flange: an integer indicating how much to look in height and width
  from each query block.
@@ -3652,9 +3651,12 @@ def local_attention_2d(q,
  dropout_rate=0.,
  name="local_2d",
  make_image_summary=False)
- # Put representations back into original shapes.
+
+ # Form padded output shape: [batch, heads, h_padded, w_padded, depth_v].
  padded_q_shape = common_layers.shape_list(q)
- output = scatter_blocks_2d(output, q_indices, padded_q_shape)
+ output_shape = padded_q_shape[:4] + v_shape[4:]
+ # Put representations back into original shapes.
+ output = scatter_blocks_2d(output, q_indices, output_shape)
 
  # Remove the padding if introduced.
  output = tf.slice(output, [0, 0, 0, 0, 0],
@@ -3935,8 +3937,7 @@ def masked_local_attention_2d(q,
  Args:
  q: a Tensor with shape [batch, heads, h, w, depth_k]
  k: a Tensor with shape [batch, heads, h, w, depth_k]
- v: a Tensor with shape [batch, heads, h, w, depth_v]. In the current
- implementation, depth_v must be equal to depth_k.
+ v: a Tensor with shape [batch, heads, h, w, depth_v]
  query_shape: an tuple indicating the height and width of each query block.
  query_shape = block_shape
  memory_flange: an integer indicating how much to look in height and width
@@ -4000,9 +4001,12 @@ def masked_local_attention_2d(q,
  dropout_rate=0.,
  name="masked_local_2d",
  make_image_summary=False)
- # Put representations back into original shapes.
+
+ # Form padded output shape: [batch, heads, h_padded, w_padded, depth_v].
  padded_q_shape = common_layers.shape_list(q)
- output = scatter_blocks_2d(output, q_indices, padded_q_shape)
+ padded_output_shape = padded_q_shape[:4] + v_shape[4:]
+ # Put representations back into original shapes.
+ output = scatter_blocks_2d(output, q_indices, padded_output_shape)
 
  # Remove the padding if introduced.
  output = tf.slice(output, [0, 0, 0, 0, 0],

diff --git a/tensor2tensor/layers/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py
@@ -517,8 +517,7 @@ def testMaskedLocalAttention1D(self, batch, heads, length, depth_k, depth_v,
  ("", 1, 1, 8, 4, 4, (2, 2)),
  ("dynamic_batch", None, 1, 8, 4, 4, (2, 2)),
  ("batches", 3, 2, 8, 4, 4, (2, 2)),
- # TODO(trandustin): Extend function to enable depth_k != depth_v.
- # ("depth_v", 1, 1, 8, 4, 1, (2, 2)),
+ ("depth_v", 1, 1, 8, 4, 1, (2, 2)),
  ("query_shape", 1, 1, 8, 4, 4, (4, 4)),
  )
  def testMaskedLocalAttention2D(self, batch, heads, length, depth_k, depth_v,
@@ -567,8 +566,7 @@ def testLocalUnmaskedAttention1D(self, batch, heads, length,
  ("matching_block_length", 3, 4, 25, 16, 16, (4, 4)),
  ("unmatching_block_length", 3, 4, 25, 16, 16, (5, 5)),
  ("dynamic_batch", None, 4, 25, 16, 16, (4, 4)),
- # TODO(trandustin): Extend function to enable depth_k != depth_v.
- # ("different_depth_v", 3, 4, 25, 16, 17, (4, 4)),
+ ("different_depth_v", 3, 4, 25, 16, 17, (4, 4)),
  )
  def testLocalUnmaskedAttention2D(self, batch, heads, length,
  depth_k, depth_v, query_shape):