Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DotLayer heuristical dim matching causes problems #1154

Closed
albertz opened this issue Oct 19, 2022 · 1 comment · Fixed by #1155
Closed

DotLayer heuristical dim matching causes problems #1154

albertz opened this issue Oct 19, 2022 · 1 comment · Fixed by #1155
Labels
potential-new-behavior Discussions about RETURNN behaviour

Comments

@albertz
Copy link
Member

albertz commented Oct 19, 2022

Specifically, these tests fail:

def test_DotLayer_dim_wrong_matching_same_dim_value():
  """
  Two distinct feature dims which merely share the same static size (5)
  must stay separate in the DotLayer output when reducing over the common time dim,
  i.e. the expected output dims are (batch, feat, other-feat).
  """
  from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
  time_dim = SpatialDim("time")
  # Two different dim tags with the same dimension value.
  feat_dim = FeatureDim("feat", dimension=5)
  feat2_dim = FeatureDim("other-feat", dimension=5)

  # First, directly check DotLayer.get_out_data_from_opts.
  # This is closer to how it is used in returnn_common,
  # and it might trigger different errors due to the dim matching logic of DotLayer,
  # which behaves slightly differently when there are no size_placeholders set yet,
  # see Dim.is_equal with unknown_spatial_matches.
  a = Data("a", dim_tags=[batch_dim, time_dim, feat_dim])
  b = Data("b", dim_tags=[batch_dim, time_dim, feat2_dim])
  net = TFNetwork(config=Config(), extern_data=ExternData())
  out = DotLayer.get_out_data_from_opts(
    name="dot",
    sources=[InternalLayer(name="a", network=net, output=a), InternalLayer(name="b", network=net, output=b)],
    reduce=time_dim)
  assert out.dim_tags == (batch_dim, feat_dim, feat2_dim)

  # Now the same check via a full network config, including actually running the graph.
  config = Config({
    "extern_data": {
      "a": {"dim_tags": [batch_dim, time_dim, feat_dim]},
      "b": {"dim_tags": [batch_dim, time_dim, feat2_dim]},
    },
    "network": {
      "output": {"class": "dot", "from": ["data:a", "data:b"], "reduce": time_dim},
    },
    "debug_runtime_sanity_checks": True,
  })
  with make_scope() as session:
    net = TFNetwork(config=config)
    net.construct_from_dict(config.typed_dict["network"])
    layer = net.get_default_output_layer()
    assert layer.output.dim_tags == (batch_dim, feat_dim, feat2_dim)
    feed_dict = make_feed_dict(net.extern_data)
    session.run(layer.output.placeholder, feed_dict=feed_dict)


def test_DotLayer_dim_wrong_matching_derived():
  """
  A dim derived from another dim (here ``time * 2``) must not be matched with its base dim:
  when reducing over the common feature dim,
  the expected output dims are (batch, time, time*2).
  """
  from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
  time_dim = SpatialDim("time")
  time_dim_2 = time_dim * 2
  # Sanity check: the multiplied dim is expected to record time_dim as its base.
  assert time_dim_2.derived_from_tag == time_dim
  assert time_dim_2.get_same_derived_base() == time_dim
  feat_dim = FeatureDim("feat", dimension=5)

  # First, directly check DotLayer.get_out_data_from_opts.
  # This is closer to how it is used in returnn_common,
  # and it might trigger different errors due to the dim matching logic of DotLayer,
  # which behaves slightly differently when there are no size_placeholders set yet,
  # see Dim.is_equal with unknown_spatial_matches.
  a = Data("a", dim_tags=[batch_dim, time_dim, feat_dim])
  b = Data("b", dim_tags=[batch_dim, time_dim_2, feat_dim])
  net = TFNetwork(config=Config(), extern_data=ExternData())
  out = DotLayer.get_out_data_from_opts(
    name="dot",
    sources=[InternalLayer(name="a", network=net, output=a), InternalLayer(name="b", network=net, output=b)],
    reduce=feat_dim)
  assert out.dim_tags == (batch_dim, time_dim, time_dim_2)

  # Now the same check via a full network config, including actually running the graph.
  config = Config({
    "extern_data": {
      "a": {"dim_tags": [batch_dim, time_dim, feat_dim]},
      "b": {"dim_tags": [batch_dim, time_dim_2, feat_dim]},
    },
    "network": {
      "output": {"class": "dot", "from": ["data:a", "data:b"], "reduce": feat_dim},
    },
    "debug_runtime_sanity_checks": True,
  })
  with make_scope() as session:
    net = TFNetwork(config=config)
    net.construct_from_dict(config.typed_dict["network"])
    layer = net.get_default_output_layer()
    assert layer.output.dim_tags == (batch_dim, time_dim, time_dim_2)
    feed_dict = make_feed_dict(net.extern_data)
    session.run(layer.output.placeholder, feed_dict=feed_dict)

The first test fails with:

Traceback (most recent call last):
  File "/Users/az/i6/setups/2022-03-19--sis-i6-exp/ext/returnn/tests/test_TFNetworkLayer.py", line 7166, in test_DotLayer_dim_wrong_matching_same_dim_value
    line: assert out.dim_tags == (batch_dim, feat_dim, feat2_dim)
    locals:
      out = <local> Data{'dot_output', [B?,F|F'feat'(5)]}
      out.dim_tags = <local> (Dim{B}, Dim{F'feat'(5)})
      batch_dim = <local> Dim{B}
      feat_dim = <local> Dim{F'feat'(5)}
      feat2_dim = <local> Dim{F'other-feat'(5)}
AssertionError: assert (Dim{B}, Dim{F'feat'(5)}) == (Dim{B}, Dim{F'feat'(5)}, Dim{F'other-feat'(5)})
  Right contains one more item: Dim{F'other-feat'(5)}
  Full diff:
  - (Dim{B}, Dim{F'feat'(5)}, Dim{F'other-feat'(5)})
  + (Dim{B}, Dim{F'feat'(5)})

The second test fails with:

Traceback (most recent call last):
  File "/Users/az/i6/setups/2022-03-19--sis-i6-exp/ext/returnn/tests/test_TFNetworkLayer.py", line 7208, in test_DotLayer_dim_wrong_matching_derived
    line: assert out.dim_tags == (batch_dim, time_dim, time_dim_2)
    locals:
      out = <local> Data{'dot_output', [B?,T|'time'[?]]}
      out.dim_tags = <local> (Dim{B}, Dim{'time'[?]})
      batch_dim = <local> Dim{B}
      time_dim = <local> Dim{'time'[?]}
      time_dim_2 = <local> Dim{'time*2'[?]}
AssertionError: assert (Dim{B}, Dim{'time'[?]}) == (Dim{B}, Dim{'time'[?]}, Dim{'time*2'[?]})
  Right contains one more item: Dim{'time*2'[?]}
  Full diff:
  - (Dim{B}, Dim{'time'[?]}, Dim{'time*2'[?]})
  + (Dim{B}, Dim{'time'[?]})

(Actually, it already fails earlier, at the assert time_dim_2.derived_from_tag == time_dim check, but that needs an independent fix.)

The problematic code is in DotLayer._auto_var_axes:

    is_equal_opts = dict(
      treat_feature_as_spatial=True, allow_same_spatial_dim=True,
      undefined_matches=True, derived_matches=True)

I think we have derived_matches mostly for RecLayer and its subnet template construction in some cases, but I don't remember exactly which cases anymore. The allow_same_spatial_dim option is there for all networks in which dim tags are not used consistently everywhere.

Note that this logic is only relevant when the user does not explicitly specify var1 and var2, which is maybe not so common.

Anyway, a change of this behavior probably requires a new behavior version.

@albertz
Copy link
Member Author

albertz commented Oct 24, 2022

I just realized that, I think, this problem was already mentioned in #865.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
potential-new-behavior Discussions about RETURNN behaviour
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant