redhatcloudx · miyunari · Jul 25, 2023 · Jul 21, 2023 · Jul 23, 2023 · Jul 24, 2023
diff --git a/src/cloudimagedirectory/connection/connection.py b/src/cloudimagedirectory/connection/connection.py
@@ -31,6 +31,25 @@ def is_provided_by(self, name: str) -> bool:
         """Check the origin of the file."""
         return f"{name}/" in self.filename
 
+    def is_API(self, api: str) -> bool:
+        """Check if the file is the actual API entry and not a sub url.
+
+        The first path segment of ``self.filename`` must equal ``api``.
+        For "v1" that alone is sufficient. Any other version must also have
+        exactly 10 slashes and a 40-character final segment, e.g.
+        v2/os/rhel/provider/aws/version/8.6.0/region/eu-west-3/image/<hash>
+        """
+        segments = self.filename.split("/")
+        if segments[0] != api:
+            return False
+
+        # NOTE: v1 entries are accepted purely on their prefix.
+        if api == "v1":
+            return True
+
+        # NOTE: 10 slashes == 11 segments; the last one is the hash value.
+        return len(segments) == 11 and len(segments[-1]) == 40
+
 
 class ConnectionFS:
     """Handles the connection to the filesystem."""

diff --git a/src/cloudimagedirectory/transform/transform.py b/src/cloudimagedirectory/transform/transform.py
@@ -1,5 +1,6 @@
 """Transforms the raw data into useful data."""
 import copy
+import hashlib
 import os
 from datetime import datetime
 from typing import Any, Callable, no_type_check
@@ -84,7 +85,6 @@ class TransformerIdxListImageLatest(Transformer):
     # TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
     @no_type_check
     def run(self, data: Transformer) -> list:  # noqa: C901
-        """Sort the raw data."""
         # NOTE: Verify that the data is not raw.
         entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]
 
@@ -281,7 +281,6 @@ class TransformerIdxListImageNames(Transformer):
     # TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
     @no_type_check
     def run(self, data: type[Transformer]) -> list:
-        """Sort the raw data."""
         # NOTE: Verify that the data is not raw.
         entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]
 
@@ -295,27 +294,143 @@ def run(self, data: type[Transformer]) -> list:
         return [connection.DataEntry("v1/idx/list/image-names", results)]
 
 
+class TransformerAWSV2RHEL(Transformer):
+    """Transform raw rhel AWS data into the v2 path schema."""
+
+    def run(self, data: list) -> list:
+        """Create a DataEntry for every matching image in the raw AWS files.
+
+        An image matches when its OwnerId equals config.AWS_RHEL_OWNER_ID.
+        The region is the basename of the raw file up to the first dot.
+
+        Returns:
+            The new entries, one per matching image, in input order.
+        """
+        # NOTE: Only raw files that are provided by aws are processed.
+        raw_files = [x for x in data if x.is_provided_by("aws") and x.is_raw()]
+
+        transformed = []
+        for raw_file in raw_files:
+            raw = self.src_conn.get_content(copy.deepcopy(raw_file))
+            region = os.path.basename(raw.filename).split(".")[0]
+
+            for image in raw.content:
+                if image["OwnerId"] != config.AWS_RHEL_OWNER_ID:
+                    continue
+
+                details = format_aws.image_rhel(image, region)
+                normalized_name = details["name"].replace(" ", "_").lower()
+                version = details["version"]
+                # NOTE: The cloud providers do not all have unique numbers to
+                # identify their images, so the id is derived from the image name.
+                digest = hashlib.sha1(normalized_name.encode()).hexdigest()  # noqa: S324
+
+                # NOTE: example of expected paths
+                # v2/os/rhel/provider/aws/version/8.6.0/region/eu-west-3/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
+                # v2/os/rhel/provider/aws/version/8.2.0/region/eu-north-1/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
+                path = f"v2/os/rhel/provider/aws/version/{version}/region/{region}/image/{digest}"
+                transformed.append(connection.DataEntry(path, details))
+        return transformed
+
+
+class TransformerAzureV2RHEL(Transformer):
+    """Transform raw rhel Azure data into the schema."""
+
+    def run(self, data: list) -> list:
+        """Transform the raw data.
+
+        Only images whose publisher is "RedHat" are kept. Every entry is
+        stored under the fixed region "global"; the region a raw file
+        belongs to is not part of the generated path.
+
+        Returns:
+            One DataEntry per kept image, in input order.
+        """
+        # NOTE: Verify that the data is raw and provided by azure.
+        entries = [x for x in data if x.is_provided_by("azure") and x.is_raw()]
+
+        results = []
+        for e in entries:
+            raw = self.src_conn.get_content(copy.deepcopy(e))
+
+            for content in raw.content:
+                if content["publisher"] != "RedHat":
+                    continue
+
+                content["hyperVGeneration"] = "unknown"
+
+                image_data = format_azure.image_rhel(content)
+                image_name = image_data["name"].replace(" ", "_").lower()
+                version = image_data["version"]
+                # NOTE: The cloud providers do not all have unique numbers to
+                # identify their images, so the id is derived from the image name.
+                image_id = hashlib.sha1(image_name.encode()).hexdigest()  # noqa: S324
+
+                # NOTE: example of an expected path (the region is always "global")
+                # v2/os/rhel/provider/azure/version/311.161/region/global/image/93212c01392a1e372edd399bde5838066089b22c
+                path = f"v2/os/rhel/provider/azure/version/{version}/region/global/image/{image_id}"
+                results.append(connection.DataEntry(path, image_data))
+        return results
+
+
+class TransformerGoogleV2RHEL(Transformer):
+    """Transform raw rhel Google data into the v2 path schema."""
+
+    def run(self, data: list) -> list:
+        """Create a DataEntry for every image with "rhel" in its name.
+
+        Returns:
+            The new entries, stored under the fixed region "global".
+        """
+        # NOTE: Only raw files that are provided by google are processed.
+        raw_files = [x for x in data if x.is_provided_by("google") and x.is_raw()]
+
+        transformed = []
+        for raw_file in raw_files:
+            raw = self.src_conn.get_content(copy.deepcopy(raw_file))
+
+            for image in raw.content:
+                # NOTE: The alias is set on every image, even on skipped ones.
+                image["creation_timestamp"] = image["creationTimestamp"]
+                if "rhel" not in image["name"]:
+                    continue
+
+                details = format_google.image_rhel(image)
+                normalized_name = details["name"].replace(" ", "_").lower()
+                version = details["version"]
+                # NOTE: The cloud providers do not all have unique numbers to
+                # identify their images, so the id is derived from the image name.
+                digest = hashlib.sha1(normalized_name.encode()).hexdigest()  # noqa: S324
+
+                # NOTE: example of expected paths
+                # v2/os/rhel/provider/google/version/8.6.0/region/global/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
+                # v2/os/rhel/provider/google/version/8.2.0/region/global/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
+                path = f"v2/os/rhel/provider/google/version/{version}/region/global/image/{digest}"
+                transformed.append(connection.DataEntry(path, details))
+        return transformed
+
+
 class TransformerV2All(Transformer):
     """Genearate list of all image details."""
 
     # TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
     @no_type_check
     def run(self, data: type[Transformer]) -> list:
-        """Sort the raw data."""
-        # NOTE: Verify that the data is not raw.
-        entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]
+        # NOTE: Verify that the data is from api v2.
+        entries = [x for x in data if x.is_API("v2")]
 
         results = []
 
         for e in entries:
             entry = copy.deepcopy(e)
+
             filename = entry.filename.split("/")
             if len(filename) < 3:
                 print("warn: could not determine region or provider of image: " + entry.filename)
                 continue
 
-            entry.content["provider"] = filename[1]
-            entry.content["region"] = filename[2]
+            entry.content["provider"] = filename[4]
+            entry.content["region"] = filename[8]
             results.append(entry.content)
 
         results.sort(key=lambda x: x["name"], reverse=False)
@@ -339,20 +454,18 @@ def display_name(self) -> dict:
     # TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
     @no_type_check
     def run(self, data: type[Transformer]) -> list:
-        """Sort the raw data."""
-        # NOTE: Verify that the data is not raw.
-        entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]
+        # NOTE: Verify that the data is from api v2.
+        entries = [x for x in data if x.is_API("v2")]
 
         results = []
         os_list = {}
 
         for e in entries:
             entry = copy.deepcopy(e)
+            filename = entry.filename.split("/")[10]
 
             try:
-                filename = entry.filename.split("/")[3]
-                print(entry.filename)
-                os = filename.split("_")[0]
+                os = entry.filename.split("/")[2]
 
                 if os not in os_list:
                     os_list[os] = 1
@@ -361,27 +474,7 @@ def run(self, data: type[Transformer]) -> list:
             except IndexError:
                 print(f"Could not format image, filename: {filename}")
 
-        rhel_products = {
-            "rh-ocp-worker",
-            "rh-oke-worker",
-            "rh-opp-worker",
-            "rh-rhel",
-            "rhel-arm64",
-            "rhel-byos",
-            "rhel-raw",
-            "rhel-sap-apps",
-            "rhel-sap-ha",
-            "rh",
-        }
-
-        os_list_final: dict[Any, Any] = {}
-        for os, val in list(os_list.items()):
-            key = os
-            if os in rhel_products:
-                key = "rhel"
-            os_list_final[key] = os_list_final.get(key, 0) + val
-
-        for os, val in os_list_final.items():
+        for os, val in os_list.items():
             desc = self.description.get(os, "no description")
             disp_name = self.display_name.get(os, "no display name")
 
@@ -394,4 +487,5 @@ def run(self, data: type[Transformer]) -> list:
 
             results.append(entry_object)
 
+        # NOTE: Add /list suffix to prevent collision with "os" folder.
         return [connection.DataEntry("v2/os/list", results)]
diff --git a/src/cloudimagedirectory/transformer.py b/src/cloudimagedirectory/transformer.py
@@ -53,6 +53,7 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s
     filters = [
         filter.FilterImageByFilename("test"),
         filter.FilterImageByFilename("beta"),
+        filter.FilterImageByFilename("raw"),
         filter.FilterImageByUniqueName(),
     ]
 
@@ -64,7 +65,7 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s
         filter_after = pd.to_datetime(filter_until)
         filters.append(filter.FilterImageByLatestUpdate(filter_after))
 
-    pipeline = transform.Pipeline(
+    pipeline_v1 = transform.Pipeline(
         origin_connection,
         [
             transform.TransformerAWS,
@@ -78,12 +79,29 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s
             transform.TransformerIdxListImageLatestGoogle,
             transform.TransformerIdxListImageLatestAWS,
             transform.TransformerIdxListImageLatestAZURE,
+        ],
+    )
+    print("run pipeline v1")
+    results = pipeline_v1.run(filenames)
+
+    # NOTE: Introducing a second pipeline, to avoid filtering of v1/v2 data
+    # based on the image filename.
+    # We do not adapt the filter, since v1 will be removed soon.
+    pipeline_v2 = transform.Pipeline(
+        origin_connection,
+        [
+            transform.TransformerAWSV2RHEL,
+            transform.TransformerAzureV2RHEL,
+            transform.TransformerGoogleV2RHEL,
+        ],
+        filters,
+        [
             transform.TransformerV2All,
             transform.TransformerV2ListOS,
         ],
     )
-    print("run pipeline")
-    results = pipeline.run(filenames)
+    print("run pipeline v2")
+    results.extend(pipeline_v2.run(filenames))
 
     for result in results:
         result.filename = destination_path + "/" + result.filename

diff --git a/tests/transformer/test_aws_rhel.py b/tests/transformer/test_aws_rhel.py
@@ -0,0 +1,33 @@
+"""Tests for the v2 AWS RHEL transformer."""
+import filecmp
+import os
+
+from cloudimagedirectory import transformer
+
+
+def test_aws_v2_rhel_transformer_command(runner, tmp_path):
+    """Verify that we can transform AWS data for RHEL."""
+    # NOTE: Path fragments shared by the checks below.
+    image_dir = "v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image"
+    image_id = "4031b089d970c84bf7fad57831ba552e36517a3f"
+
+    result = runner.invoke(
+        transformer.run,
+        [
+            "-f",
+            "tests/transformer/testdata/input/raw/aws/af-south-1.json",
+            "-op=.",
+            f"-dp={tmp_path}",
+            "--filter.until=none",
+        ],
+    )
+
+    assert result.exit_code == 0, f"expected no error, but got code {result.exit_code} and output:\n{result.output}"
+
+    # Ensure the directory was made.
+    assert os.path.isdir(f"{tmp_path}/{image_dir}")
+
+    # NOTE: shallow=False forces a byte by byte comparison; the default may
+    # accept two files on matching os.stat() signatures alone.
+    expected = f"{os.getcwd()}/tests/transformer/testdata/expected/{image_dir}/{image_id}"
+    assert filecmp.cmp(expected, f"{tmp_path}/{image_dir}/{image_id}", shallow=False)
diff --git a/tests/transformer/test_azure_rhel.py b/tests/transformer/test_azure_rhel.py
@@ -0,0 +1,33 @@
+"""Tests for the v2 Azure RHEL transformer."""
+import filecmp
+import os
+
+from cloudimagedirectory import transformer
+
+
+def test_azure_v2_rhel_transformer_command(runner, tmp_path):
+    """Verify that we can transform Azure data for RHEL."""
+    # NOTE: Path fragments shared by the checks below.
+    image_dir = "v2/os/rhel/provider/azure/version/311.161/region/global/image"
+    image_id = "93212c01392a1e372edd399bde5838066089b22c"
+
+    result = runner.invoke(
+        transformer.run,
+        [
+            "-f",
+            "tests/transformer/testdata/input/raw/azure/eastus.json",
+            "-op=.",
+            f"-dp={tmp_path}",
+            "--filter.until=none",
+        ],
+    )
+
+    assert result.exit_code == 0, f"expected no error, but got code {result.exit_code} and output:\n{result.output}"
+
+    # Ensure the directory was made.
+    assert os.path.isdir(f"{tmp_path}/{image_dir}")
+
+    # NOTE: shallow=False forces a byte by byte comparison; the default may
+    # accept two files on matching os.stat() signatures alone.
+    expected = f"{os.getcwd()}/tests/transformer/testdata/expected/{image_dir}/{image_id}"
+    assert filecmp.cmp(expected, f"{tmp_path}/{image_dir}/{image_id}", shallow=False)