Another tweak to intake metadata problems (#3595)

* Another tweak to intake metadata problems Make sure we can't end up with undefined `metadata`. Record details of `metadata.log` access to `run.controller` without adding a ton of separate messages.
distributed-system-analysis · Jan 9, 2024 · 3db0cf9 · 3db0cf9
1 parent 5d9ce4d
commit 3db0cf9
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 27 deletions.
diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py
@@ -834,24 +834,22 @@ def get_inventory(self, path: str) -> dict[str, Any]:
  return {"name": path, "type": type, "stream": stream}
 
  @staticmethod
- def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
+ def _get_metadata(tarball_path: Path) -> JSONOBJECT:
  """Fetch the values in metadata.log from the tarball.
 
+ Args:
+ tarball_path: a file path to a tarball
+
  Returns:
- A JSON representation of the dataset `metadata.log` or None if the
- tarball has no metadata.log.
+ A JSON representation of the dataset `metadata.log`
  """
  name = Dataset.stem(tarball_path)
- try:
- data = Tarball.extract(tarball_path, f"{name}/metadata.log")
- except CacheExtractBadPath:
- return None
- else:
- metadata_log = MetadataLog()
- metadata_log.read_file(e.decode() for e in data)
- data.close()
- metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
- return metadata
+ data = Tarball.extract(tarball_path, f"{name}/metadata.log")
+ metadata_log = MetadataLog()
+ metadata_log.read_file(e.decode() for e in data)
+ data.close()
+ metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
+ return metadata
 
  @staticmethod
  def subprocess_run(
@@ -1442,18 +1440,28 @@ def create(self, tarfile_path: Path) -> Tarball:
  raise BadFilename(tarfile_path)
  name = Dataset.stem(tarfile_path)
  controller_name = None
+ errwhy = "unknown"
  try:
  metadata = Tarball._get_metadata(tarfile_path)
- controller_name = metadata["run"]["controller"]
- except Exception as exc:
- self.logger.warning(
- "{} metadata.log is missing run.controller: {!r}", name, exc
- )
+ except Exception as e:
+ metadata = None
+ errwhy = str(e)
+ else:
+ run = metadata.get("run")
+ if run:
+ controller_name = run.get("controller")
+ if not controller_name:
+ errwhy = "missing 'controller' in 'run' section"
+ else:
+ errwhy = "missing 'run' section"
 
  if not controller_name:
  controller_name = "unknown"
  self.logger.warning(
- "{} has no controller name, assuming {!r}", name, controller_name
+ "{} has no controller name ({}), assuming {!r}",
+ name,
+ errwhy,
+ controller_name,
  )
 
  if name in self.tarballs:

diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py
@@ -249,8 +249,8 @@ def test_create_bad(
  # The create call will remove the source files, so trying again should
  # result in an error.
  with pytest.raises(BadFilename) as exc:
- cm.create(source_md5)
- assert exc.value.path == str(source_md5)
+ cm.create(source_tarball)
+ assert exc.value.path == str(source_tarball)
 
  # Attempting to create the same dataset again (from the archive copy)
  # should fail with a duplicate dataset error.
@@ -262,6 +262,29 @@ def test_create_bad(
  assert tarball.metadata == fake_get_metadata(tarball.tarball_path)
  assert exc.value.tarball == tarball.name
 
+ def test_create_null_metalog(
+ self,
+ monkeypatch,
+ db_session,
+ selinux_disabled,
+ server_config,
+ make_logger,
+ tarball,
+ ):
+ """Test create when we can't get metadata.log"""
+
+ def get_throw(_p: Path):
+ raise Exception("Catch!")
+
+ source_tarball, source_md5, md5 = tarball
+ cm = CacheManager(server_config, make_logger)
+ monkeypatch.setattr(Tarball, "_get_metadata", get_throw)
+
+ # Attempting to create a dataset with a metadata error should succeed,
+ # with "unknown" controller
+ tarobj = cm.create(source_tarball)
+ assert tarobj.controller_name == "unknown"
+
  def test_duplicate(
  self,
  monkeypatch,
@@ -1221,12 +1244,11 @@ def fake_extract(t: Path, f: Path):
 
  with monkeypatch.context() as m:
  m.setattr(Tarball, "extract", staticmethod(fake_extract))
- metadata = Tarball._get_metadata(Path(tarball))
-
- if stream:
- assert metadata == {"test": {"foo": "bar"}}
- else:
- assert metadata is None
+ try:
+ metadata = Tarball._get_metadata(Path(tarball))
+ assert stream and metadata == {"test": {"foo": "bar"}}
+ except Exception as e:
+ assert isinstance(e, CacheExtractBadPath) and not stream
 
  def test_inventory_without_subprocess(self):
  """Test the Inventory class when used without a subprocess