GlobalFishingWatch · rdgfuentes · Jul 5, 2024 · Jul 4, 2024 · Jul 4, 2024 · Jul 4, 2024
diff --git a/packages/pipe-vms-ingestion/poetry.lock b/packages/pipe-vms-ingestion/poetry.lock
diff --git a/packages/pipe-vms-ingestion/pyproject-build.toml b/packages/pipe-vms-ingestion/pyproject-build.toml
@@ -27,6 +27,8 @@ packages = [
   python = ">=3.9,<3.11"
   apache-beam = {extras = ["gcp"], version = "2.56.0"}
   jinja2 = "^3.0.3"
+  shipdataprocess = "0.8.6"
+  pandas = "^2.2.2"
 
   [tool.poetry.group.dev.dependencies]
   autopep8 = "2.0.2"

diff --git a/packages/pipe-vms-ingestion/pyproject.toml b/packages/pipe-vms-ingestion/pyproject.toml
@@ -24,6 +24,8 @@ readme = 'README.md'
   python = ">=3.9,<3.11"
   apache-beam = {extras = ["gcp"], version = "2.56.0"}
   jinja2 = "^3.0.3"
+  shipdataprocess = "0.8.6"
+  pandas = "^2.2.2"
 
     [tool.poetry.dependencies.bigquery]
     path = "../libs/bigquery"

diff --git a/packages/pipe-vms-ingestion/tests/vms_ingestion/normalization/feeds/test_bra_normalize.py b/packages/pipe-vms-ingestion/tests/vms_ingestion/normalization/feeds/test_bra_normalize.py
@@ -13,7 +13,13 @@
 class TestBRANormalize(unittest.TestCase):
 
     options = build_pipeline_options_with_defaults(
-        argv=["--country_code=bra", '--source=""', '--destination=""', '--start_date=""', '--end_date=""']
+        argv=[
+            "--country_code=bra",
+            '--source=""',
+            '--destination=""',
+            '--start_date=""',
+            '--end_date=""',
+        ]
     )
 
     # Our input data, which will make up the initial PCollection.
@@ -34,7 +40,7 @@ class TestBRANormalize(unittest.TestCase):
     # Our output data, which is the expected data that the final PCollection must match.
     EXPECTED = [
         {
-            "msgid": "bb84c21c72cb41d85d724245b52cf2ab",
+            "msgid": "5411260da7a997217b865007b51e7295",
             "source": "ONYXSAT_BRAZIL_VMS",
             "source_type": "VMS",
             "source_tenant": "BRA",
@@ -43,18 +49,18 @@ class TestBRANormalize(unittest.TestCase):
             "source_ssvid": "4961089",
             "type": "VMS",
             "internal_id": "4961089",
-            "ssvid": "BRA|i:4961089|s:Cibradep X",
+            "ssvid": "BRA|i:4961089|s:CIBRADEP10",
             "timestamp": datetime.fromisoformat("2024-05-01 05:35:45+00:00"),
             "lat": -1.21861112117767,
             "lon": -48.4911117553711,
             "speed": 9.1792656587473,
             "course": 192.0,
             "heading": None,
-            "shipname": "Cibradep X",
+            "shipname": "CIBRADEP X",
             "callsign": None,
             "destination": None,
             "imo": None,
-            "shiptype": "fishing",
+            "shiptype": "FISHING",
             "receiver_type": None,
             "receiver": None,
             "length": None,
@@ -63,7 +69,9 @@ class TestBRANormalize(unittest.TestCase):
             "class_b_cs_flag": None,
             "received_at": None,
             "ingested_at": None,
-            "timestamp_date": datetime.date(datetime.fromisoformat("2024-05-01 05:35:45+00:00")),
+            "timestamp_date": datetime.date(
+                datetime.fromisoformat("2024-05-01 05:35:45+00:00")
+            ),
         }
     ]
 
@@ -78,4 +86,6 @@ def test_normalize(self):
             output: pvalue.PCollection = input | BRANormalize(feed="bra")
 
             # Assert that the output PCollection matches the EXPECTED data.
-            assert_that(output, pcol_equal_to(TestBRANormalize.EXPECTED), label="CheckOutput")
+            assert_that(
+                output, pcol_equal_to(TestBRANormalize.EXPECTED), label="CheckOutput"
+            )
diff --git a/packages/pipe-vms-ingestion/tests/vms_ingestion/normalization/feeds/test_chl_normalize.py b/packages/pipe-vms-ingestion/tests/vms_ingestion/normalization/feeds/test_chl_normalize.py
@@ -13,7 +13,13 @@
 class TestCHLNormalize(unittest.TestCase):
 
     options = build_pipeline_options_with_defaults(
-        argv=["--country_code=chl", '--source=""', '--destination=""', '--start_date=""', '--end_date=""']
+        argv=[
+            "--country_code=chl",
+            '--source=""',
+            '--destination=""',
+            '--start_date=""',
+            '--end_date=""',
+        ]
     )
 
     # Our input data, which will make up the initial PCollection.
@@ -33,15 +39,15 @@ class TestCHLNormalize(unittest.TestCase):
     # Our output data, which is the expected data that the final PCollection must match.
     EXPECTED = [
         {
-            "msgid": "1d4e4dde1f0178df0f396c14587feb04",
-            "source": "chile_vms_some_fleet",
+            "msgid": "c8de2f410ae274c94fbd6583f722fc6b",
+            "source": "CHILE_VMS_SOME_FLEET",
             "source_type": "VMS",
             "source_tenant": "CHL",
             "source_provider": "SERNAPESCA",
-            "source_fleet": "some_fleet",
+            "source_fleet": "SOME_FLEET",
             "source_ssvid": None,
             "type": "VMS",
-            "ssvid": "CHL|s:AUSTRAL TRAVELER|c:ABC123",
+            "ssvid": "CHL|s:AUSTRALTRAVELER|c:ABC123",
             "timestamp": datetime.fromisoformat("2020-01-01 20:23:01+00:00"),
             "lat": -52.546,
             "lon": -71.947,
@@ -62,7 +68,9 @@ class TestCHLNormalize(unittest.TestCase):
             "class_b_cs_flag": None,
             "received_at": None,
             "ingested_at": None,
-            "timestamp_date": datetime.date(datetime.fromisoformat("2020-01-01 20:23:01+00:00")),
+            "timestamp_date": datetime.date(
+                datetime.fromisoformat("2020-01-01 20:23:01+00:00")
+            ),
         }
     ]
 
@@ -77,4 +85,6 @@ def test_normalize(self):
             output: pvalue.PCollection = input | CHLNormalize(feed="chl")
 
             # Assert that the output PCollection matches the EXPECTED data.
-            assert_that(output, pcol_equal_to(TestCHLNormalize.EXPECTED), label="CheckOutput")
+            assert_that(
+                output, pcol_equal_to(TestCHLNormalize.EXPECTED), label="CheckOutput"
+            )
diff --git a/packages/pipe-vms-ingestion/tests/vms_ingestion/normalization/feeds/test_cri_normalize.py b/packages/pipe-vms-ingestion/tests/vms_ingestion/normalization/feeds/test_cri_normalize.py
@@ -13,7 +13,13 @@
 class TestCRINormalize(unittest.TestCase):
 
     options = build_pipeline_options_with_defaults(
-        argv=["--country_code=cri", '--source=""', '--destination=""', '--start_date=""', '--end_date=""']
+        argv=[
+            "--country_code=cri",
+            '--source=""',
+            '--destination=""',
+            '--start_date=""',
+            '--end_date=""',
+        ]
     )
 
     # Our input data, which will make up the initial PCollection.
@@ -51,24 +57,26 @@ class TestCRINormalize(unittest.TestCase):
             "lat": 9.9798,
             "length": None,
             "lon": -84.8221,
-            "msgid": "758d0202a1b6b7166ddb9851e6731091",
+            "msgid": "0df1eb16d80e0b61fb9639177b3c71fb",
             "received_at": None,
             "receiver": None,
             "receiver_type": None,
             "registry_number": None,
             "shipname": "K'IN",
             "shiptype": None,
-            "source": "costarica_vms_sardineros",
-            "source_fleet": "sardineros",
+            "source": "COSTARICA_VMS_SARDINEROS",
+            "source_fleet": "SARDINEROS",
             "source_provider": "INCOPESCA",
             "source_tenant": "CRI",
             "source_ssvid": None,
             "source_type": "VMS",
             "speed": 0.0,
-            "ssvid": "CRI|s:K'IN",
+            "ssvid": "CRI|s:KIN",
             "status": None,
             "timestamp": datetime.fromisoformat("2024-05-01 12:15:01+00:00"),
-            "timestamp_date": datetime.date(datetime.fromisoformat("2024-05-01 12:15:01+00:00")),
+            "timestamp_date": datetime.date(
+                datetime.fromisoformat("2024-05-01 12:15:01+00:00")
+            ),
             "type": "VMS",
             "width": None,
         },
@@ -85,4 +93,6 @@ def test_normalize(self):
             output: pvalue.PCollection = input | CRINormalize(feed="cri")
 
             # Assert that the output PCollection matches the EXPECTED data.
-            assert_that(output, pcol_equal_to(TestCRINormalize.EXPECTED), label="CheckOutput")
+            assert_that(
+                output, pcol_equal_to(TestCRINormalize.EXPECTED), label="CheckOutput"
+            )
diff --git a/packages/pipe-vms-ingestion/vms_ingestion/normalization/transforms/map_normalized_message.py b/packages/pipe-vms-ingestion/vms_ingestion/normalization/transforms/map_normalized_message.py
@@ -1,6 +1,12 @@
 from datetime import datetime
 
 import apache_beam as beam
+from shipdataprocess.normalize import normalize_callsign, normalize_shipname
+from shipdataprocess.standardize import (
+    standardize_imo,
+    standardize_int_str,
+    standardize_str,
+)
 from vms_ingestion.normalization.transforms.calculate_msgid import get_message_id
 from vms_ingestion.normalization.transforms.calculate_ssvid import encode_ssvid
 
@@ -9,33 +15,33 @@ def map_normalized_message(msg, feed, source_provider, source_format):
     result = {
         **msg,
         "source_type": "VMS",
-        "source_tenant": feed.upper(),
-        "source_provider": source_provider,
-        "source_fleet": msg.get("fleet"),
+        "source_tenant": standardize_str(feed),
+        "source_provider": standardize_str(source_provider),
+        "source_fleet": standardize_str(msg.get("fleet")),
         "source_ssvid": msg.get("internal_id"),
-        "type": msg.get("type", "VMS"),
+        "type": standardize_str(msg.get("type", "VMS")),
         "timestamp": msg["timestamp"],
         "lat": msg["lat"],
         "lon": msg["lon"],
         "speed": msg.get("speed"),
         "course": msg.get("course"),
         "heading": msg.get("heading"),
-        "shipname": msg["shipname"],
-        "callsign": msg["callsign"] if msg["callsign"] else None,
+        "shipname": standardize_str(msg["shipname"]),
+        "callsign": standardize_str(msg["callsign"]) if msg["callsign"] else None,
         "destination": msg.get("destination"),
-        "imo": msg.get("imo"),
-        "shiptype": msg.get("shiptype"),
+        "imo": standardize_imo(msg.get("imo")),
+        "shiptype": standardize_str(msg.get("shiptype")),
         "receiver_type": msg.get("receiver_type"),
         "receiver": msg.get("receiver"),
         "length": msg.get("length"),
         "width": msg.get("width"),
-        "status": msg.get("status"),
-        "class_b_cs_flag": msg.get("class_b_cs_flag"),
+        "status": standardize_int_str(msg.get("status")),
+        "class_b_cs_flag": standardize_int_str(msg.get("class_b_cs_flag")),
         "received_at": msg.get("received_at"),
         "ingested_at": msg.get("ingested_at"),
         "timestamp_date": datetime.date(msg["timestamp"]),
     }
-    return {**result, "source": source_format.format(**result)}
+    return {**result, "source": standardize_str(source_format.format(**result))}
 
 
 class MapNormalizedMessage(beam.PTransform):
@@ -46,7 +52,12 @@ def __init__(self, feed, source_provider, source_format):
         self.source_format = source_format
 
     def expand(self, pcoll):
-        return pcoll | self.map_normalized_message() | self.calculate_ssvid() | self.calculate_message_id()
+        return (
+            pcoll
+            | self.map_normalized_message()
+            | self.calculate_ssvid()
+            | self.calculate_message_id()
+        )
 
     def map_normalized_message(self):
         return beam.Map(
@@ -81,8 +92,8 @@ def calculate_ssvid(self):
                 "ssvid": encode_ssvid(
                     country=msg["source_tenant"],
                     internal_id=msg.get("internal_id"),
-                    shipname=msg.get("shipname"),
-                    callsign=msg.get("callsign"),
+                    shipname=normalize_shipname(msg.get("shipname")),
+                    callsign=normalize_callsign(msg.get("callsign")),
                     licence=msg.get("licence"),
                 ),
             }