Skip to content
This repository has been archived by the owner on Aug 21, 2024. It is now read-only.

V2 create basic structure #671 #682

Merged
merged 8 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/cloudimagedirectory/connection/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,25 @@ def is_provided_by(self, name: str) -> bool:
"""Check the origin of the file."""
return f"{name}/" in self.filename

def is_API(self, api: str) -> bool:
    """Check if the file is the actual API entry and not a sub url.

    Args:
        api: API version that has to match the first path segment of the
            filename, e.g. "v1" or "v2".

    Returns:
        False when the first path segment differs from ``api``. True for every
        file whose first segment is "v1". Otherwise True only when the filename
        contains exactly ten slashes and its last segment is 40 characters long.
    """
    segments = self.filename.split("/")
    if segments[0] != api:
        return False

    # NOTE: no further layout checks are applied to v1 files.
    if api == "v1":
        return True

    # NOTE: ten slashes are equal to eleven path segments, e.g.
    # v2/os/rhel/provider/aws/version/8.6.0/region/eu-west-3/image/<hash>
    if len(segments) != 11:
        return False

    # NOTE: check length of hash value.
    return len(segments[-1]) == 40


class ConnectionFS:
"""Handles the connection to the filesystem."""
Expand Down
162 changes: 128 additions & 34 deletions src/cloudimagedirectory/transform/transform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Transforms the raw data into useful data."""
import copy
import hashlib
import os
from datetime import datetime
from typing import Any, Callable, no_type_check
Expand Down Expand Up @@ -84,7 +85,6 @@ class TransformerIdxListImageLatest(Transformer):
# TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
@no_type_check
def run(self, data: Transformer) -> list: # noqa: C901
"""Sort the raw data."""
# NOTE: Verify that the data is not raw.
entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]

Expand Down Expand Up @@ -281,7 +281,6 @@ class TransformerIdxListImageNames(Transformer):
# TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
@no_type_check
def run(self, data: type[Transformer]) -> list:
"""Sort the raw data."""
# NOTE: Verify that the data is not raw.
entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]

Expand All @@ -295,27 +294,143 @@ def run(self, data: type[Transformer]) -> list:
return [connection.DataEntry("v1/idx/list/image-names", results)]


class TransformerAWSV2RHEL(Transformer):
    """Turn the raw AWS image data into RHEL entries of the v2 schema."""

    def run(self, data: list) -> list:
        """Create one v2 data entry per RHEL image found in the raw AWS files.

        Args:
            data: Entries to inspect; only those that are provided by "aws"
                and are raw get read.

        Returns:
            A list of ``connection.DataEntry`` objects, one for every image
            whose "OwnerId" equals ``config.AWS_RHEL_OWNER_ID``.
        """
        os_name = "rhel"
        provider = "aws"

        # NOTE: Only raw files that originate from AWS are relevant here.
        raw_files = [x for x in data if x.is_provided_by("aws") and x.is_raw()]

        transformed = []
        for raw_file in raw_files:
            # NOTE: The content is requested with a copy of the entry.
            raw = self.src_conn.get_content(copy.deepcopy(raw_file))
            # NOTE: The region is the base name of the file up to its first dot.
            region = os.path.basename(raw.filename).split(".", 1)[0]

            for content in raw.content:
                if content["OwnerId"] != config.AWS_RHEL_OWNER_ID:
                    continue

                image_data = format_aws.image_rhel(content)
                image_name = image_data["name"].replace(" ", "_").lower()
                version = image_data["version"]
                # NOTE: The cloud providers are not consistent and do not all offer
                # a unique number to identify an image. The sha1 hex digest of the
                # normalised image name is used as image id instead.
                image_id = hashlib.sha1(image_name.encode()).hexdigest()  # noqa: S324

                # NOTE: example of expected paths
                # v2/os/rhel/provider/aws/version/8.6.0/region/eu-west-3/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
                # v2/os/rhel/provider/aws/version/8.2.0/region/eu-north-1/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
                path = f"v2/os/{os_name}/provider/{provider}/version/{version}/region/{region}/image/{image_id}"
                transformed.append(connection.DataEntry(path, image_data))

        return transformed


class TransformerAzureV2RHEL(Transformer):
    """Turn the raw Azure image data into RHEL entries of the v2 schema."""

    def run(self, data: list) -> list:
        """Create one v2 data entry per Red Hat image found in the raw Azure files.

        Args:
            data: Entries to inspect; only those that are provided by "azure"
                and are raw get read.

        Returns:
            A list of ``connection.DataEntry`` objects, one for every image
            whose "publisher" is "RedHat".
        """
        os_name = "rhel"
        provider = "azure"
        # NOTE: Every Azure image is filed under the same region.
        region = "global"

        # NOTE: Only raw files that originate from Azure are relevant here.
        raw_files = [x for x in data if x.is_provided_by("azure") and x.is_raw()]

        transformed = []
        for raw_file in raw_files:
            # NOTE: The content is requested with a copy of the entry.
            raw = self.src_conn.get_content(copy.deepcopy(raw_file))

            for content in raw.content:
                if content["publisher"] != "RedHat":
                    continue

                # NOTE: The generation is always overwritten before the image gets formatted.
                content["hyperVGeneration"] = "unknown"

                image_data = format_azure.image_rhel(content)
                image_name = image_data["name"].replace(" ", "_").lower()
                version = image_data["version"]
                # NOTE: The cloud providers are not consistent and do not all offer
                # a unique number to identify an image. The sha1 hex digest of the
                # normalised image name is used as image id instead.
                image_id = hashlib.sha1(image_name.encode()).hexdigest()  # noqa: S324

                # NOTE: example of expected paths
                # v2/os/rhel/provider/azure/version/8.6.0/region/global/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
                # v2/os/rhel/provider/azure/version/8.2.0/region/global/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
                path = f"v2/os/{os_name}/provider/{provider}/version/{version}/region/{region}/image/{image_id}"
                transformed.append(connection.DataEntry(path, image_data))

        return transformed


class TransformerGoogleV2RHEL(Transformer):
    """Turn the raw Google image data into RHEL entries of the v2 schema."""

    def run(self, data: list) -> list:
        """Create one v2 data entry per RHEL image found in the raw Google files.

        Args:
            data: Entries to inspect; only those that are provided by "google"
                and are raw get read.

        Returns:
            A list of ``connection.DataEntry`` objects, one for every image
            whose "name" contains "rhel".
        """
        os_name = "rhel"
        provider = "google"
        # NOTE: Every Google image is filed under the same region.
        region = "global"

        # NOTE: Only raw files that originate from Google are relevant here.
        raw_files = [x for x in data if x.is_provided_by("google") and x.is_raw()]

        transformed = []
        for raw_file in raw_files:
            # NOTE: The content is requested with a copy of the entry.
            raw = self.src_conn.get_content(copy.deepcopy(raw_file))

            for content in raw.content:
                # NOTE: The timestamp is duplicated under a snake_case key for
                # every image, no matter whether it is a RHEL image or not.
                content["creation_timestamp"] = content["creationTimestamp"]
                if "rhel" not in content["name"]:
                    continue

                image_data = format_google.image_rhel(content)
                image_name = image_data["name"].replace(" ", "_").lower()
                version = image_data["version"]
                # NOTE: The cloud providers are not consistent and do not all offer
                # a unique number to identify an image. The sha1 hex digest of the
                # normalised image name is used as image id instead.
                image_id = hashlib.sha1(image_name.encode()).hexdigest()  # noqa: S324

                # NOTE: example of expected paths
                # v2/os/rhel/provider/google/version/8.6.0/region/global/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
                # v2/os/rhel/provider/google/version/8.2.0/region/global/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
                path = f"v2/os/{os_name}/provider/{provider}/version/{version}/region/{region}/image/{image_id}"
                transformed.append(connection.DataEntry(path, image_data))

        return transformed


class TransformerV2All(Transformer):
"""Generate list of all image details."""

# TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
@no_type_check
def run(self, data: type[Transformer]) -> list:
"""Sort the raw data."""
# NOTE: Verify that the data is not raw.
entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]
# NOTE: Verify that the data is from api v2.
entries = [x for x in data if x.is_API("v2")]

results = []

for e in entries:
entry = copy.deepcopy(e)

filename = entry.filename.split("/")
if len(filename) < 3:
print("warn: could not determine region or provider of image: " + entry.filename)
continue

entry.content["provider"] = filename[1]
entry.content["region"] = filename[2]
entry.content["provider"] = filename[4]
Copy link
Member

@F-X64 F-X64 Jul 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is totally fine but I can imagine that these array access values might be confusing in the long run. Maybe we can use namedtuples in the future? It would at least make accessing the individual fields easier.

Just an example:

  from collections import namedtuple
  
  APIPath = namedtuple('APIPath', ['api_version', 'distro', 'provider', 'version','region','image_id'])
  
  split_path = "v2/rhel/google/9.0.0/us-east-1/asdasdasdadaf".split('/')
  api_path = APIPath(*split_path)
  
  print(api_path.api_version)
  print(api_path.image_id)

That api_path could be a property of the DataEntry object or something.

Copy link
Member Author

@miyunari miyunari Jul 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, that looks much easier to maintain. 😄 Is it OK if I adopt this in another PR? :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created a new issue for it :) #685

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Absolutely, thanks for taking care of this.

entry.content["region"] = filename[8]
results.append(entry.content)

results.sort(key=lambda x: x["name"], reverse=False)
Expand All @@ -339,20 +454,18 @@ def display_name(self) -> dict:
# TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later.
@no_type_check
def run(self, data: type[Transformer]) -> list:
"""Sort the raw data."""
# NOTE: Verify that the data is not raw.
entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")]
# NOTE: Verify that the data is from api v2.
entries = [x for x in data if x.is_API("v2")]

results = []
os_list = {}

for e in entries:
entry = copy.deepcopy(e)
filename = entry.filename.split("/")[10]

try:
filename = entry.filename.split("/")[3]
print(entry.filename)
os = filename.split("_")[0]
os = entry.filename.split("/")[2]

if os not in os_list:
os_list[os] = 1
Expand All @@ -361,27 +474,7 @@ def run(self, data: type[Transformer]) -> list:
except IndexError:
print(f"Could not format image, filename: {filename}")

rhel_products = {
"rh-ocp-worker",
"rh-oke-worker",
"rh-opp-worker",
"rh-rhel",
"rhel-arm64",
"rhel-byos",
"rhel-raw",
"rhel-sap-apps",
"rhel-sap-ha",
"rh",
}

os_list_final: dict[Any, Any] = {}
for os, val in list(os_list.items()):
key = os
if os in rhel_products:
key = "rhel"
os_list_final[key] = os_list_final.get(key, 0) + val

for os, val in os_list_final.items():
for os, val in os_list.items():
desc = self.description.get(os, "no description")
disp_name = self.display_name.get(os, "no display name")

Expand All @@ -394,4 +487,5 @@ def run(self, data: type[Transformer]) -> list:

results.append(entry_object)

# NOTE: Add /list suffix to prevent collision with "os" folder.
return [connection.DataEntry("v2/os/list", results)]
24 changes: 21 additions & 3 deletions src/cloudimagedirectory/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s
filters = [
filter.FilterImageByFilename("test"),
filter.FilterImageByFilename("beta"),
filter.FilterImageByFilename("raw"),
filter.FilterImageByUniqueName(),
]

Expand All @@ -64,7 +65,7 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s
filter_after = pd.to_datetime(filter_until)
filters.append(filter.FilterImageByLatestUpdate(filter_after))

pipeline = transform.Pipeline(
pipeline_v1 = transform.Pipeline(
origin_connection,
[
transform.TransformerAWS,
Expand All @@ -78,12 +79,29 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s
transform.TransformerIdxListImageLatestGoogle,
transform.TransformerIdxListImageLatestAWS,
transform.TransformerIdxListImageLatestAZURE,
],
)
print("run pipeline v1")
results = pipeline_v1.run(filenames)

# NOTE: Introducing a second pipeline, to avoid filtering of v1/v2 data
# based on the image filename.
# We do not adapt the filter, since v1 will be removed soon.
pipeline_v2 = transform.Pipeline(
origin_connection,
[
transform.TransformerAWSV2RHEL,
transform.TransformerAzureV2RHEL,
transform.TransformerGoogleV2RHEL,
],
filters,
[
transform.TransformerV2All,
transform.TransformerV2ListOS,
],
)
print("run pipeline")
results = pipeline.run(filenames)
print("run pipeline v2")
results.extend(pipeline_v2.run(filenames))

for result in results:
result.filename = destination_path + "/" + result.filename
Expand Down
33 changes: 33 additions & 0 deletions tests/transformer/test_aws_rhel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Tests for the v2 AWS RHEL transformer."""
import filecmp
import os

from cloudimagedirectory import transformer


def test_aws_v2_rhel_transformer_command(runner, tmp_path):
    """Verify that we can transform AWS data for RHEL.

    The transformer command is invoked on one raw AWS file and has to write
    the expected v2 image file below ``tmp_path``.
    """
    cli_args = [
        "-f",
        "tests/transformer/testdata/input/raw/aws/af-south-1.json",
        "-op=.",
        f"-dp={tmp_path}",
        "--filter.until=none",
    ]
    result = runner.invoke(transformer.run, cli_args)

    assert result.exit_code == 0, f"expected no error, but got code {result.exit_code} and output:\n{result.output}"

    # Ensure the directory was made.
    assert os.path.isdir(f"{tmp_path}/v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image")

    # The expected file is addressed relative to the current working directory.
    pwd = os.getcwd()
    expected_file = f"{pwd}/tests/transformer/testdata/expected/v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image/4031b089d970c84bf7fad57831ba552e36517a3f"
    generated_file = f"{tmp_path}/v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image/4031b089d970c84bf7fad57831ba552e36517a3f"

    # Check image data by comparing the expected file and the output file byte by byte.
    # NOTE: shallow=False is required for that; by default filecmp.cmp already
    # treats files with identical os.stat() signatures as equal.
    assert filecmp.cmp(expected_file, generated_file, shallow=False)
33 changes: 33 additions & 0 deletions tests/transformer/test_azure_rhel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Tests for the v2 Azure RHEL transformer."""
import filecmp
import os

from cloudimagedirectory import transformer


def test_aws_v2_rhel_transformer_command(runner, tmp_path):
    """Verify that we can transform Azure data for RHEL.

    The transformer command is invoked on one raw Azure file and has to write
    the expected v2 image file below ``tmp_path``.
    """
    # NOTE(review): the function name says "aws" although Azure data is tested;
    # the name is kept as is here, consider renaming it in a follow-up.
    cli_args = [
        "-f",
        "tests/transformer/testdata/input/raw/azure/eastus.json",
        "-op=.",
        f"-dp={tmp_path}",
        "--filter.until=none",
    ]
    result = runner.invoke(transformer.run, cli_args)

    assert result.exit_code == 0, f"expected no error, but got code {result.exit_code} and output:\n{result.output}"

    # Ensure the directory was made.
    assert os.path.isdir(f"{tmp_path}/v2/os/rhel/provider/azure/version/311.161/region/global/image")

    # The expected file is addressed relative to the current working directory.
    pwd = os.getcwd()
    expected_file = f"{pwd}/tests/transformer/testdata/expected/v2/os/rhel/provider/azure/version/311.161/region/global/image/93212c01392a1e372edd399bde5838066089b22c"
    generated_file = f"{tmp_path}/v2/os/rhel/provider/azure/version/311.161/region/global/image/93212c01392a1e372edd399bde5838066089b22c"

    # Check image data by comparing the expected file and the output file byte by byte.
    # NOTE: shallow=False is required for that; by default filecmp.cmp already
    # treats files with identical os.stat() signatures as equal.
    assert filecmp.cmp(expected_file, generated_file, shallow=False)
Loading