From 4812ff4b13e2a8f530d260885dbd4eaa002b752c Mon Sep 17 00:00:00 2001 From: jaeyoonpark Date: Wed, 26 Jan 2022 13:24:51 +0100 Subject: [PATCH 1/4] update setup and others --- .pre-commit-config.yaml | 11 + pyproject.toml | 8 + setup.cfg | 4 + setup.py | 20 +- shipdataprocess/__init__.py | 10 +- shipdataprocess/collapse.py | 192 ++++++--- shipdataprocess/normalize.py | 325 ++++++++------- shipdataprocess/shiptype.py | 658 +++++++++++++++++++------------ shipdataprocess/standardize.py | 529 ++++++++++++++++++------- tests/test_normalize_shipname.py | 55 ++- 10 files changed, 1186 insertions(+), 626 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ca8c002 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: +- repo: https://github.com/psf/black + rev: 21.11b0 + hooks: + - id: black + args: [--line-length=79] +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..eb1bdcb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] + +[tool.black] +line-length = 79 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index a0183fb..c62e09e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,6 @@ [metadata] Obsoletes-Dist: ShipDataProcess + +[options.extras_require] +test = + pytest diff --git a/setup.py b/setup.py index 61eff23..58342bf 100644 --- a/setup.py +++ b/setup.py @@ -5,21 +5,15 @@ """ import codecs -import os from setuptools import find_packages from setuptools import setup -package = __import__('shipdataprocess') +package = __import__("shipdataprocess") -DEPENDENCIES = [ - "pytest", - "unidecode", - "roman", - "Django" -] +DEPENDENCIES = ["pytest", "unidecode", "roman"] -with codecs.open('README.md', encoding='utf-8') as f: +with codecs.open("README.md", encoding="utf-8") as f: readme = f.read().strip() setup( @@ -28,13 +22,13 @@ description=package.__doc__.strip(), include_package_data=True, install_requires=DEPENDENCIES, - keywords=['ship','vessel','fishing','normalization'], + python_requires=">=3.6", + keywords=["ship", "vessel", "fishing", "normalization"], license="Apache 2.0", long_description=readme, - name='shipdataprocess', - packages=find_packages(exclude=['test*.*', 'tests']), + name="shipdataprocess", + packages=find_packages(exclude=["test*.*", "tests"]), url=package.__source__, version=package.__version__, zip_safe=True, ) - diff --git a/shipdataprocess/__init__.py b/shipdataprocess/__init__.py index 7b322e5..de625df 100644 --- a/shipdataprocess/__init__.py +++ b/shipdataprocess/__init__.py @@ -3,15 +3,15 @@ """ -__version__ = '0.6.18' -__author__ = 'Jaeyoon Park' -__email__ = 'jaeyoon.park13@gmail.com' -__source__ = 'https://github.com/GlobalFishingWatch/shipdataprocess' +__version__ = "0.7.1" +__author__ = "Jaeyoon Park" +__email__ = "jaeyoon@globalfishingwatch.org" +__source__ = "https://github.com/GlobalFishingWatch/shipdataprocess" __license__ = """ Copyright 2017 Global Fishing Watch Inc. Authors: -Jaeyoon Park +Jaeyoon Park Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
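Illustrative sketch (editorial, not part of the patch): the updated setup.py still reads its metadata by importing the package itself, so the values bumped in shipdataprocess/__init__.py are what a build picks up. Assuming the package is importable from the working tree:

    # Sketch only: setup.py resolves metadata via __import__("shipdataprocess"),
    # so the bumped __init__.py values flow into the built distribution.
    package = __import__("shipdataprocess")
    print(package.__version__)  # "0.7.1" after this patch
    print(package.__source__)   # passed to setup() as the project url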
diff --git a/shipdataprocess/collapse.py b/shipdataprocess/collapse.py index b18a0ce..823a580 100644 --- a/shipdataprocess/collapse.py +++ b/shipdataprocess/collapse.py @@ -1,32 +1,51 @@ +""" +This file contains functions that help collapse (multiple) rows +for each vessel found in the process of producing Global Fishing Watch's +internal vessel database. + +Last updated: 2022-01-24 +Jaeyoon Park +""" + import pandas as pd import numpy as np import re from collections import Counter -### helper functions for collapsing rows by vessel - def non_zero_mean(x): try: - x = x[(x!=0)&(x!=None)] - if len(x)==0: return 0.0 - else: return x.mean() - except: + x = x[(x != 0) & (x is not None)] + if len(x) == 0: + return 0.0 + else: + return x.mean() + except AttributeError: return 0.0 - + + def non_zero_std(x): try: - x = x[(x!=0)&(x!=None)] - if len(x)<2: return 0.0 - else: return x.std() - except: + x = x[(x != 0) & (x is not None)] + if len(x) < 2: + return 0.0 + else: + return x.std() + except AttributeError: return 0.0 - -def most_common_value(x): ## remove if standard deviation is too big compared to mean value of all numbers - '''remove if standard deviation is too big compared to mean value of all numbers''' - if (type(x)==list)&(len(x)>0): + + +def most_common_value(x): + """ + Remove if standard deviation is too big compared to mean value of + all numbers. The standard deviation threshold is set to be 10%. + + x: Pandas Series or list, a list of numerical values + (for length, tonnage, engine power) + """ + if (type(x) == list) & (len(x) > 0): x = pd.Series(x) - if (type(x)==pd.core.series.Series)&(len(x.dropna())>0): + if (type(x) == pd.core.series.Series) & (len(x.dropna()) > 0): x_mean = non_zero_mean(x) x_std = non_zero_std(x) if x_std > x_mean * 0.1: @@ -36,95 +55,156 @@ def most_common_value(x): ## remove if standard deviation is too big compared to else: return np.nan + def most_common_value_with_confidence(cx): - '''same functionality as most_common_value() but with confidence level taken account''' - if (type(cx)==pd.core.series.Series)&(len(cx)>0): - if len(cx.dropna())==0: + """ + same functionality as most_common_value() but with confidence level + taken into account + + cx: Pandas Series or list, a list of numerical values + (for length, tonnage, engine power) + with a confidence level indicator attached with '-' in front of the value. + """ + if (type(cx) == pd.core.series.Series) & (len(cx) > 0): + if len(cx.dropna()) == 0: return np.nan else: cx = list(cx.values) - if (type(cx)==list)&(len(cx)>0): - clist = [int(elem.split('-')[0]) for elem in cx if (elem==elem)&(elem!=None)] - xlist = [elem for elem in cx if (elem==elem)&(elem!=None)] - if len(clist)>0: + if (type(cx) == list) & (len(cx) > 0): + clist = [ + int(elem.split("-")[0]) + for elem in cx + if (elem == elem) & (elem is not None) + ] + xlist = [elem for elem in cx if (elem == elem) & (elem is not None)] + if len(clist) > 0: max_c = max(clist) - x = [float(elem.split('-')[1]) for elem in xlist if int(elem.split('-')[0])==max_c] + x = [ + float(elem.split("-")[1]) + for elem in xlist + if int(elem.split("-")[0]) == max_c + ] + # Call the function to return the most common value return most_common_value(x) else: return np.nan else: return np.nan - -def most_common_num(x): ## mostly for imo collapsing + + +def most_common_num(x): + """ + Return the most common number (mostly for imo collapsing). 
+ + x: Pandas Series, a list of numbers + """ try: x = x.dropna() - if len(x)==0: + if len(x) == 0: return np.nan else: vals = x.values - vs = [v for v in vals if (v!=0)] - #vs = list(set(vs)) - if len(vs)==0: + vs = [v for v in vals if (v != 0)] + # vs = list(set(vs)) + if len(vs) == 0: return np.nan - else: + else: data = Counter(vs) return max(vs, key=data.get) - except: + except AttributeError: return np.nan - + + def most_common_str(x): + """ + Return the most common string. + + x: Pandas Series, a list of values in string + """ try: x = x.dropna() - if len(x)==0: + if len(x) == 0: return np.nan else: - vals = x.values - vs = [re.sub('\s+',' ',str(v)).strip().upper() for v in x.values] - vs = [v for v in vs if v!=''] - #vs = list(set(vs)) - if len(vs)==0: + vs = [ + re.sub(r"\s+", " ", str(v)).strip().upper() for v in x.values + ] + vs = [v for v in vs if v != ""] + # vs = list(set(vs)) + if len(vs) == 0: return np.nan else: data = Counter(vs) return max(vs, key=data.get) - #if len(vs)==1: - # return vs[0] - #else: - # return None - except: + except AttributeError: return np.nan - -def str_attached(x): ## join all strings + + +def str_attached(x): + """ + Return all strings joined. If the values are in numbers, convert them + to string and combined. + + :param x: Pandas Series or list + :return: A joined string + """ try: x = x.dropna() - if len(x)==0: + if len(x) == 0: return np.nan else: - x = x.apply(lambda v: str(int(v)) if (type(v)==float)|(type(v)==int)|(type(v)==long) else v) + x = x.apply( + lambda v: str(int(v)) + if (type(v) == float) | (type(v) == int) + else v + ) vals = x.values.tolist() - #vs = [str(v).strip() for v in vals if (v==v)&(v!=None)&(v!='')] - #vs = [v for v in vs if (v!='')] + # vs = [str(v).strip() for v in vals if (v==v)&(v!=None)&(v!='')] + # vs = [v for v in vs if (v!='')] vs = list(set(vals)) - return ', '.join(sorted(vs)) - except: + return ", ".join(sorted(vs)) + except AttributeError: return np.nan - + + def min_time(x): + """ + Return the minimum time + + :param x: Pandas Series + :return: Timestamp + """ vals = x.values - vs = [v for v in vals if (v==v)&(v!=None)&(v!='')] + vs = [v for v in vals if (v == v) & (v is not None) & (v != "")] vs = pd.Series(vs) + return vs.min() + def max_time(x): + """ + Return the maximum time + + :param x: Pandas Series + :return: Timestamp + """ vals = x.values - vs = [v for v in vals if (v==v)&(v!=None)&(v!='')] + vs = [v for v in vals if (v == v) & (v is not None) & (v != "")] vs = pd.Series(vs) + return vs.max() + def highest_confidence(x): + """ + Return the maximum confidence if none return 1 (the lowest). + + :param x: Pandas Series or list + :return: Integer + """ x = x.dropna() - if len(x)>0: + if len(x) > 0: return max(x.tolist()) else: return 1 diff --git a/shipdataprocess/normalize.py b/shipdataprocess/normalize.py index 29104a9..c95a2cd 100644 --- a/shipdataprocess/normalize.py +++ b/shipdataprocess/normalize.py @@ -1,205 +1,260 @@ +""" +This file provides functions that normalize ship name and call sign of a vessel +either recorded in registries or in vessel tracking data. The normalization, or +standardization of string, will ensure that the strings are comparable to other +strings despite various ways of recording names of the same vessel. +It also removes all non-essential characters or white spaces. 
+""" from unidecode import unidecode import roman import re -import sys def normalize_shipname(name): - - if (name is None)|(name != name)|(name == ''): - return None + """ + Return a normalized ship name by removing all non-essential characters, + prefix, and suffix, and standardizing roman numerals or other parts + of the vessel name. - # - # Turn to upper cases - name = name.upper() - + :param name: String, an original vessel name + :return: String, a normalized vessel name + """ + if (name is None) | (name != name) | (name == ""): + return None + print(name) # # Remove nasty characters and white spaces - if sys.version_info[0] < 3: + # try: + # name = unidecode(str(name.decode("utf-8"))) + # except UnicodeDecodeError: + # name = unidecode(str(name.decode("iso_8859-1"))) + + if issubclass(type(name), str): + name = unidecode(name) + elif isinstance(name, bytes): try: - name = unidecode(str(name)) # get rid of nasty characters, but sometimes this fails - except: - try: - name = unidecode(str(name.decode('utf8'))) - except: - name = unidecode(str(name.decode('iso_8859-1'))) + name = unidecode(str(name, "utf-8", "strict")) + except UnicodeDecodeError: + name = unidecode(str(name, "iso-8859-1", "strict")) + elif isinstance(name, int): + name = str(name) else: - name = unidecode(str(name)) + return None + print(name) + # + # Turn to upper cases + name = name.upper() - name = re.sub('\s+',' ',name) + name = re.sub(r"\s+", " ", name) name = name.strip() - name = name.replace('\n','').replace('\r','') - + name = name.replace("\n", "").replace("\r", "") + # # Remove fishing vessel code - name = re.sub('MFV[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub('MPV[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub('HMS[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub('LPG[/|C]*[\W]*|LNG[/|C]*[\W]*', ' ', name) # LPG/LNG variations - - name = re.sub('(\s|^)F[^\w\s]*V[^\w]*', ' ', name) # fishing vessel code in English (F/V, F-V, F.V, FV: etc) - name = re.sub('^F[^\w\s]*B[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub(' F[^\w\s]*B[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*P[^\w]+', ' ', name) # fishing vessel code in Italy/Spain - name = re.sub(' M[^\w\s]*P[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*B[^\w]+', ' ', name) # fishing vessel code in Italy/Spain - name = re.sub(' M[^\w\s]*B[^\w]*(\s|$)', ' ', name) - name = re.sub('^G[^\w\s]*V[^\w]+', ' ', name) # mostly in UK - name = re.sub('S+F+[^\w]+G[^\w\s]*V[^\w]*', ' ', name) - name = re.sub(' G[^\w\s]*V[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*V[^\w]+', ' ', name) # in English - name = re.sub(' M[^\w\s]*V[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]+S[^\w]+', ' ', name) # Merchant Ship - name = re.sub(' M[^\w\s]+S[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*K[^\w]+', ' ', name) # mostly in northern europe - name = re.sub(' M[^\w\s]+K[^\w]*(\s|$)', ' ', name) - name = re.sub('^R[^\w\s]*V[^\w]+', ' ', name) # Research Vessel - name = re.sub(' R[^\w\s]*V[^\w]*(\s|$)', ' ', name) - - name = re.sub('^T[^\w\s]*T[^\w]+', ' ', name) # Tender To - name = re.sub(' T[^\w\s]*T[^\w]*($)', ' ', name) - name = re.sub('^S[^\w\s]*Y[^\w]+', ' ', name) # Steam Yacht - name = re.sub(' S[^\w\s]*Y[^\w]*($)', ' ', name) - name = re.sub('^M[^\w\s]*F[^\w]+', ' ', name) # Motor Ferry - name = re.sub(' M[^\w\s]*F[^\w]*($)', ' ', name) - name = re.sub('^S[^\w\s]*S[^\w]+', ' ', name) # Steam Ship - name = re.sub(' S[^\w\s]*S[^\w]*($)', ' ', name) - name = 
re.sub('^S[^\w\s]*V[^\w]+', ' ', name) # Sailing Vessel - name = re.sub(' S[^\w\s]*V[^\w]*($)', ' ', name) - name = re.sub('^M[^\w\s]*T[^\w]+', ' ', name) # Motor Tanker - name = re.sub(' M[^\w\s]*T[^\w]*($)', ' ', name) - name = re.sub('^M[^\w\s]+Y[^\w]+', ' ', name) # Motor Yacht - name = re.sub(' M[^\w\s]+Y[^\w]*($)', ' ', name) - name = re.sub('^[A-Z]/[A-Z][^\w]+', ' ', name) # All other types of X/X - name = re.sub(' [A-Z]/[A-Z]($)', ' ', name) - name = re.sub('^[A-Z]\\\\[A-Z][^\w]+', ' ', name) ## All other types of X\X - name = re.sub(' [A-Z]\\\\[A-Z]($)', ' ', name) - name = re.sub('^KM[^\w]+', ' ', name) # Indonesia K.M - name = re.sub('^E.B. ', ' ', name) # Dutch E.B. equivalent to NO. - - name = re.sub('\(.+\)', ' ', name) # All additional information in parentheses - name = re.sub('\[.+\]', ' ', name) - + name = re.sub(r"MFV[^\w]+", " ", name) # fishing vessel code in English + name = re.sub(r"MPV[^\w]+", " ", name) # fishing vessel code in English + name = re.sub(r"HMS[^\w]+", " ", name) # fishing vessel code in English + name = re.sub( + r"LPG[/|C]*[\W]*|LNG[/|C]*[\W]*", " ", name + ) # LPG/LNG variations + + name = re.sub( + r"(\s|^)F[^\w\s]*V[^\w]*", " ", name + ) # fishing vessel code in English (F/V, F-V, F.V, FV: etc) + name = re.sub( + r"^F[^\w\s]*B[^\w]+", " ", name + ) # fishing vessel code in English + name = re.sub(r" F[^\w\s]*B[^\w]*(\s|$)", " ", name) + name = re.sub( + r"^M[^\w\s]*P[^\w]+", " ", name + ) # fishing vessel code in Italy/Spain + name = re.sub(r" M[^\w\s]*P[^\w]*(\s|$)", " ", name) + name = re.sub( + r"^M[^\w\s]*B[^\w]+", " ", name + ) # fishing vessel code in Italy/Spain + name = re.sub(r" M[^\w\s]*B[^\w]*(\s|$)", " ", name) + name = re.sub(r"^G[^\w\s]*V[^\w]+", " ", name) # mostly in UK + name = re.sub(r"S+F+[^\w]+G[^\w\s]*V[^\w]*", " ", name) + name = re.sub(r" G[^\w\s]*V[^\w]*(\s|$)", " ", name) + name = re.sub(r"^M[^\w\s]*V[^\w]+", " ", name) # in English + name = re.sub(r" M[^\w\s]*V[^\w]*(\s|$)", " ", name) + name = re.sub(r"^M[^\w\s]+S[^\w]+", " ", name) # Merchant Ship + name = re.sub(r" M[^\w\s]+S[^\w]*(\s|$)", " ", name) + name = re.sub(r"^M[^\w\s]*K[^\w]+", " ", name) # mostly in northern europe + name = re.sub(r" M[^\w\s]+K[^\w]*(\s|$)", " ", name) + name = re.sub(r"^R[^\w\s]*V[^\w]+", " ", name) # Research Vessel + name = re.sub(r" R[^\w\s]*V[^\w]*(\s|$)", " ", name) + + name = re.sub(r"^T[^\w\s]*T[^\w]+", " ", name) # Tender To + name = re.sub(r" T[^\w\s]*T[^\w]*($)", " ", name) + name = re.sub(r"^S[^\w\s]*Y[^\w]+", " ", name) # Steam Yacht + name = re.sub(r" S[^\w\s]*Y[^\w]*($)", " ", name) + name = re.sub(r"^M[^\w\s]*F[^\w]+", " ", name) # Motor Ferry + name = re.sub(r" M[^\w\s]*F[^\w]*($)", " ", name) + name = re.sub(r"^S[^\w\s]*S[^\w]+", " ", name) # Steam Ship + name = re.sub(r" S[^\w\s]*S[^\w]*($)", " ", name) + name = re.sub(r"^S[^\w\s]*V[^\w]+", " ", name) # Sailing Vessel + name = re.sub(r" S[^\w\s]*V[^\w]*($)", " ", name) + name = re.sub(r"^M[^\w\s]*T[^\w]+", " ", name) # Motor Tanker + name = re.sub(r" M[^\w\s]*T[^\w]*($)", " ", name) + name = re.sub(r"^M[^\w\s]+Y[^\w]+", " ", name) # Motor Yacht + name = re.sub(r" M[^\w\s]+Y[^\w]*($)", " ", name) + name = re.sub(r"^[A-Z]/[A-Z][^\w]+", " ", name) # All other types of X/X + name = re.sub(r" [A-Z]/[A-Z]($)", " ", name) + name = re.sub( + r"^[A-Z]\\\\[A-Z][^\w]+", " ", name + ) # All other types of X\X + name = re.sub(r" [A-Z]\\\\[A-Z]($)", " ", name) + name = re.sub(r"^KM[^\w]+", " ", name) # Indonesia K.M + name = re.sub(r"^E.B. ", " ", name) # Dutch E.B. equivalent to NO. 
+ + name = re.sub( + r"\(.+\)", " ", name + ) # All additional information in parentheses + name = re.sub(r"\[.+\]", " ", name) + # # Numbers in letters - name = re.sub(' ONE($)| UNO($)| UN($)', ' 1', name) - name = re.sub(' TWO($)| DOS($)| DEUX($)', ' 2', name) - name = re.sub(' THREE($)| TRES($)| TROIS($)', ' 3', name) - name = re.sub(' FOUR($)| CUATRO($)| QUATRE($)', ' 4', name) - name = re.sub(' FIVE($)| CINCO($)| CINQ($)', ' 5', name) - name = re.sub(' SIX($)| SEIS($)| SIX($)', ' 6', name) - name = re.sub(' SEVEN($)| SIETE($)| SEPT($)', ' 7', name) - name = re.sub(' EIGHT($)| OCHO($)| HUIT($)', ' 8', name) - name = re.sub(' NINE($)| NUEVE($)| NEUF($)', ' 9', name) - name = re.sub(' TEN($)| DIEZ($)| DIX($)', ' 10', name) - name = re.sub(' ELEVEN($)| ONCE($)| ONZE($)', ' 11', name) - name = re.sub(' TWELVE($)| DOCE($)| DOUZE($)', ' 12', name) - name = re.sub(' THIRTEEN($)| TRECE($)| TREIZE($)', ' 13', name) - name = re.sub(' FOURTEEN($)| CATORCE($)| QUATORZE($)', ' 14', name) - name = re.sub(' FIFTEEN($)| QUINCE($)| QUINZE($)', ' 15', name) - - name = re.sub('1ST ', 'FIRST ', name) - name = re.sub('2ND ', 'SECOND ', name) - name = re.sub('3RD ', 'THIRD ', name) - name = re.sub('4TH ', 'FOURTH ', name) - name = re.sub('5TH ', 'FIFTH ', name) + name = re.sub(r" ONE($)| UNO($)| UN($)", " 1", name) + name = re.sub(r" TWO($)| DOS($)| DEUX($)", " 2", name) + name = re.sub(r" THREE($)| TRES($)| TROIS($)", " 3", name) + name = re.sub(r" FOUR($)| CUATRO($)| QUATRE($)", " 4", name) + name = re.sub(r" FIVE($)| CINCO($)| CINQ($)", " 5", name) + name = re.sub(r" SIX($)| SEIS($)", " 6", name) + name = re.sub(r" SEVEN($)| SIETE($)| SEPT($)", " 7", name) + name = re.sub(r" EIGHT($)| OCHO($)| HUIT($)", " 8", name) + name = re.sub(r" NINE($)| NUEVE($)| NEUF($)", " 9", name) + name = re.sub(r" TEN($)| DIEZ($)| DIX($)", " 10", name) + name = re.sub(r" ELEVEN($)| ONCE($)| ONZE($)", " 11", name) + name = re.sub(r" TWELVE($)| DOCE($)| DOUZE($)", " 12", name) + name = re.sub(r" THIRTEEN($)| TRECE($)| TREIZE($)", " 13", name) + name = re.sub(r" FOURTEEN($)| CATORCE($)| QUATORZE($)", " 14", name) + name = re.sub(r" FIFTEEN($)| QUINCE($)| QUINZE($)", " 15", name) + + name = re.sub("1ST ", "FIRST ", name) + name = re.sub("2ND ", "SECOND ", name) + name = re.sub("3RD ", "THIRD ", name) + name = re.sub("4TH ", "FOURTH ", name) + name = re.sub("5TH ", "FIFTH ", name) # # Country specific appendix (S. Korea and China) - name = re.sub('\d+\s*HO($)', ' ', name) - name = re.sub('\d+\s*HAO($)', ' ', name) + name = re.sub(r"\d+\s*HO($)", " ", name) + name = re.sub(r"\d+\s*HAO($)", " ", name) # # Remove NO.s such in NO.5, NO5, NO:5, NO. 
5, NO 5, N5, N-5 etc - name = re.sub('NO[^\w\s]*[\s]*(?=\d+)', '', name) - name = re.sub('[\s]+N[\W_0]*(?=\d+)', '', name) - name = re.sub('NO\.\s*(?=[^0-9]+)', '', name) - + name = re.sub(r"NO[^\w\s]*[\s]*(?=\d+)", "", name) + name = re.sub(r"[\s]+N[\W_0]*(?=\d+)", "", name) + name = re.sub(r"NO\.\s*(?=[^0-9]+)", "", name) + # # Turn '&' to 'AND' - name = re.sub('(?<=[A-Z])\s+&\s+(?=[A-Z])', ' AND ', name) # replace 'BLACK & WHITE' to 'BLACK AND WHITE' - + name = re.sub( + r"(?<=[A-Z])\s+&\s+(?=[A-Z])", " AND ", name + ) # replace 'BLACK & WHITE' to 'BLACK AND WHITE' + # # Deromanization - vs = re.split('\s+|-|(?<=[A-Z]{3})\.',name) + vs = re.split(r"\s+|-|(?<=[A-Z]{3})\.", name) try: # # If last word from the name text has L/C/D/M then do not deromanize - if re.search('[LCDM]', vs[-1]).group(0): pass - except: + if re.search(r"[LCDM]", vs[-1]).group(0): + pass + except AttributeError: # # Try to deromanize the last word from the name text try: vs[-1] = roman.fromRoman(vs[-1]) vs[-1] = str(int(vs[-1])) - except: + except roman.InvalidRomanNumeralError: + # + # No corresponding roman numeral found. Let's leave it as is. pass - + # # Attach the deromanized digits to the end - name = ''.join(vs) + name = "".join(vs) - # # Now, remove all special characters - name = re.sub('[\W_]', '', name) - + name = re.sub(r"[\W_]", "", name) + # # Check if the name starts with digits, if yes move it to the end - try: - first_digit = re.search('^\d+', name).group(0) - name = re.sub('^\d+', '', name) + str(first_digit) - except: - pass + obj = re.search(r"^\d+", name) + if obj: + first_digit = obj.group(0) + name = re.sub(r"^\d+", "", name) + str(first_digit) # # Remove 0s from the numbers starting with 0s - try: - last_digit = re.search('\d+$', name).group(0) - non_zeros = re.sub('^0+', '', last_digit) - name = re.sub('\d+$', '', name) + str(non_zeros) - except: - pass + obj = re.search(r"\d+$", name) + if obj: + last_digit = obj.group(0) + non_zeros = re.sub("^0+", "", last_digit) + name = re.sub(r"\d+$", "", name) + str(non_zeros) + + # + # Remove all excessive white spaces + name = re.sub(r"\s+", " ", name) - if name == '': + if name == "" or name == " ": return None - - return name + else: + return name def normalize_callsign(callsign): + """ + Return a normalized International Radio Call Sign by removing non-essential + characters and ignoring meaningless call sign including 'NONE', 'UNKNOWN' + + :param callsign: String, an original call sign + :return: String, a normalized call sign + """ - if (callsign is None) | (callsign != callsign) | (callsign == '') | \ - (callsign == "NONE") | (callsign == "UNKNOWN") | (callsign == "NIL") | (callsign == "NULL"): + if ( + (callsign is None) + | (callsign != callsign) + | (callsign == "") + | (callsign == "NONE") + | (callsign == "UNKNOWN") + | (callsign == "NIL") + | (callsign == "NULL") + ): return None # # Turn to upper cases callsign = callsign.upper() - + # # Remove nasty characters, white space try: - callsign = unidecode(str(callsign)) # get rid of nasty characters, but sometimes this fails - except: + # + # get rid of nasty characters, but sometimes this fails + callsign = unidecode(str(callsign)) + except UnicodeDecodeError: try: - callsign = unidecode(str(callsign.decode('utf8'))) - except: - callsign = unidecode(str(callsign.decode('iso_8859-1'))) + callsign = unidecode(str(callsign.decode("utf8"))) + except UnicodeDecodeError: + callsign = unidecode(str(callsign.decode("iso_8859-1"))) callsign = callsign.strip() - callsign = re.sub('\s+',' 
',callsign) + callsign = re.sub(r"\s+", " ", callsign) # # Get rid of all non-word characters - callsign = re.sub('[\W_]', '', callsign) - + callsign = re.sub(r"[\W_]", "", callsign) + # # Remove 0s from callsign starting with 0s - callsign = re.sub('^0+', '', callsign) - - if callsign == '': - return None + callsign = re.sub(r"^0+", "", callsign) - return callsign \ No newline at end of file + if callsign == "": + return None + else: + return callsign diff --git a/shipdataprocess/shiptype.py b/shipdataprocess/shiptype.py index 70eff72..8ee5773 100644 --- a/shipdataprocess/shiptype.py +++ b/shipdataprocess/shiptype.py @@ -1,226 +1,305 @@ -import pandas as pd +""" +This file provides functions that process operations with regard to vessel +types defined by Global Fishing Watch (There are about 40 ship types +pre-defined). See here +https://globalfishingwatch.org/datasets-and-code-vessel-identity/ + +Last updates: 2022-01-25 +Jaeyoon Park +""" import numpy as np - def determine_shiptype(gears, shiptype_dict): - ''' - determinte_shiptype module receives multiple types of ship and returns the most specific ship type. - - -------- - ARGUMENT - -------- - gears: SERIES, LIST, OR STR, single or multiple combination of ship types joined by '|' (OR) - (examples: fixed_gear|set_longlines, cargo) - -------- - - ------ - RETURN - ------ - STR or None, select the most detailed type among the ship types received if they are all in one category, + """ + This module receives multiple types of ship and returns the most specific + ship type in the pre-defined vessel classification hierarchy. + https://globalfishingwatch.org/datasets-and-code-vessel-identity/ + + :param gears: SERIES, LIST, OR STR, single or multiple combination of ship + type joined by '|' (OR) (examples: fixed_gear|set_longlines, cargo) + :param shiptype_dict: DICT, a geartype dictionary containing 'path' + information in the vessel class hierarchy + :return: STR or None, select the most detailed type among the ship types + received if they are all in one category, otherwise a combination of ship types. 
- (examples: fixed_gear|set_longlines -> set_longlines, trawler|fixed_gear|set_longlines -> trawler|set_longlines) - ------ - ''' - + (examples: fixed_gear|set_longlines -> set_longlines, + trawler|fixed_gear|set_longlines -> trawler|set_longlines) + """ - ## if there is no information on gears, then return None - if len(gears)==0: + # + # if there is no information on gears, then return None + if len(gears) == 0: return None - - ### make sure the entry is a list of strings - if type(gears)==str: + + # + # make sure the entry is a list of strings + if type(gears) == str: gears = [gears] - elif type(gears)==list: + elif type(gears) == list: pass - else: gears = gears.tolist() - - ### remove Nones - gears = [gear.replace(' ','').strip() for gear in gears if (gear!=None)&(gear==gear)&(gear!='')] - - ### take only specific ones if there are several possibly duplicated ones (example: trawlers, trawlers|purse_seines) + else: + gears = gears.tolist() + + # + # remove Nones + gears = [ + gear.replace(" ", "").strip() + for gear in gears + if (gear is not None) & (gear == gear) & (gear != "") + ] + + # + # take only specific ones if there are several possibly duplicated ones + # (example: trawlers, trawlers|purse_seines) gears = reduce_to_specifics_with_multiples(gears, shiptype_dict) - ### get rid of '|' and take all possible gears individually - gears_split=[] + # + # get rid of '|' and take all possible gears individually + gears_split = [] for g in gears: - if '|' in g: - gears_split += g.split('|') + if "|" in g: + gears_split += g.split("|") else: gears_split.append(g) - - ### map geartype_dict to compare categories (broader ones to be removed) + + # + # map geartype_dict to compare categories + # (broader/ ones to be removed) gears = reduce_to_specifics(gears_split, shiptype_dict) - ### remove redundant values and join together with '|' + # + # remove redundant values and join together with '|' gears = sorted(list(set(gears))) - final_value = '|'.join(gears) - if final_value=='': + final_value = "|".join(gears) + if final_value == "": return None else: return final_value - def determine_shiptype_simple(gears, shiptype_dict): - ''' - same as determinte_shiptype module but without reducing multiple gears to specific (this is for testing). - ''' + """ + same as determine_shiptype module but without reducing multiple gears + to specific (this is for testing). + + :param gears: SERIES, LIST, OR STR, single or multiple combination of + ship types joined by '|' (OR) (examples: fixed_gear|set_longlines, cargo) + :param shiptype_dict: DICT, ship type dictionary containing 'path' of + gear type in the hierarchy + :return: STR or None, select the most detailed type among the ship types + received if they are all in one category, + otherwise a combination of ship types. 
+ (examples: fixed_gear|set_longlines -> set_longlines, + trawler|fixed_gear|set_longlines -> trawler|set_longlines) + """ - ## if there is no information on gears, then return None - if len(gears)==0: + # + # if there is no information on gears, then return None + if len(gears) == 0: return None - - ### make sure the entry is a list of strings - if type(gears)==str: + + # + # make sure the entry is a list of strings + if type(gears) == str: gears = [gears] - elif type(gears)==list: + elif type(gears) == list: pass - else: gears = gears.tolist() - - ### remove Nones - gears = [gear.replace(' ','').strip() for gear in gears if (gear!=None)&(gear==gear)&(gear!='')] - - ### get rid of '|' and take all possible gears individually - gears_split=[] + else: + gears = gears.tolist() + + # + # remove Nones + gears = [ + gear.replace(" ", "").strip() + for gear in gears + if (gear is not None) & (gear == gear) & (gear != "") + ] + + # + # get rid of '|' and take all possible gears individually + gears_split = [] for g in gears: - if '|' in g: - gears_split += g.split('|') + if "|" in g: + gears_split += g.split("|") else: gears_split.append(g) - - ### map geartype_dict to compare categories (broader ones to be removed) + + # + # map geartype_dict to compare categories (broader ones to be removed) gears = reduce_to_specifics(gears_split, shiptype_dict) - ### remove redundant values and join together with '|' + # + # remove redundant values and join together with '|' gears = sorted(list(set(gears))) - final_value = '|'.join(gears) - if final_value=='': + final_value = "|".join(gears) + if final_value == "": return None else: return final_value def tag_confidence_level(x, c): - if (x==x)&(x!=None)&(x!=0)&(x!=''): - return str(c) + '-' + str(x) + """ + Helper function to add confidence level to geartype + + :param x: STRING, geartype + :param c: INT, confidence level (1 to 4) + :return: STRING, geartype attached with confidence level by a dash ('-') + """ + if (x == x) & (x is not None) & (x != 0) & (x != ""): + return str(c) + "-" + str(x) else: return np.nan def determine_shiptype_with_confidence(gears, shiptype_dict): - ''' - same as determine_shiptype but with confidence level taken into account - ''' - - ## if there is no information on gears, then return None - if len(gears)==0: + """ + same as the determine_shiptype module above + but with confidence level taken into account + """ + + # + # if there is no information on gears, then return None + if len(gears) == 0: return np.nan - - ### make sure the entry is a list of strings - if type(gears)==str: + + # + # make sure the entry is a list of strings + if type(gears) == str: gears = [gears] - elif type(gears)==list: + elif type(gears) == list: pass - else: gears = gears.tolist() - - ### remove NaN/None - gears = [gear.replace(' ','').strip() for gear in gears if (gear!=None)&(gear==gear)&(gear!='')] - if len(gears)==0: + else: + gears = gears.tolist() + + # + # remove NaN/None + gears = [ + gear.replace(" ", "").strip() + for gear in gears + if (gear is not None) & (gear == gear) & (gear != "") + ] + if len(gears) == 0: return np.nan - - ### remove all gear values from lists of less confidence level - levels = [int(gear.split('-')[0]) for gear in gears] - if len(levels)>0: + + # + # remove all gear values from lists of less confidence level + levels = [int(gear.split("-")[0]) for gear in gears] + if len(levels) > 0: highest_level = max(levels) - if (highest_level==3)&(2 in levels): - gears_3 = [gear.split('-')[1] for gear in gears if ('3' in 
gear)] - gears_2 = [gear.split('-')[1] for gear in gears if ('2' in gear)] - gears = [gear.split('-')[1] for gear in gears if ('2' in gear)|('3' in gear)] + if (highest_level == 3) & (2 in levels): + gears_3 = [gear.split("-")[1] for gear in gears if ("3" in gear)] + gears_2 = [gear.split("-")[1] for gear in gears if ("2" in gear)] + gears = [ + gear.split("-")[1] + for gear in gears + if ("2" in gear) | ("3" in gear) + ] else: - gears = [gear.split('-')[1] for gear in gears if str(highest_level) in gear] - - ### take only specific ones if there are several possibly duplicated ones (example: trawlers, trawlers|purse_seines) + gears = [ + gear.split("-")[1] + for gear in gears + if str(highest_level) in gear + ] + + # + # take only specific ones if there are several possibly duplicated ones + # (example: trawlers, trawlers|purse_seines) gears = reduce_to_specifics_with_multiples(gears, shiptype_dict) - ### get rid of '|' and take all possible gears individually - gears_split=[] + # + # get rid of '|' and take all possible gears individually + gears_split = [] for g in gears: - if '|' in g: - gears_split += g.split('|') + if "|" in g: + gears_split += g.split("|") else: gears_split.append(g) - - ### map geartype_dict to compare categories (broader ones to be removed) + + # + # map geartype_dict to compare categories (broader ones to be removed) gears = reduce_to_specifics(gears_split, shiptype_dict) - ### remove redundant values and join together with '|' + # + # remove redundant values and join together with '|' gears = sorted(list(set(gears))) - final_value = '|'.join(gears) - - ### check the case of combination of level 2 and 3 - if (highest_level==3)&(2 in levels): + final_value = "|".join(gears) + + # + # check the case of combination of level 2 and 3 + if (highest_level == 3) & (2 in levels): final_value_3 = determine_shiptype(gears_3, shiptype_dict) final_value_2 = determine_shiptype(gears_2, shiptype_dict) - if (not final_value in final_value_3)&(final_value in final_value_2): + if (final_value not in final_value_3) & (final_value in final_value_2): pass - else: + else: final_value = final_value_3 - - ### output - if final_value=='': + + # + # output + if final_value == "": return np.nan else: - final_value = str(highest_level) + '-' + final_value + final_value = str(highest_level) + "-" + final_value return final_value def select_high_confidence_geartype(x, y, shiptype_dict): - '''return a geartype that has higher confidence level''' - - if (x==x)&(x!=None)&(y==y)&(y!=None): - x_level = int(x.split('-')[0]) - x_value = x.split('-')[1] - y_level = int(y.split('-')[0]) - y_value = y.split('-')[1] - ## if x confidence level is higher, return x + """ + Return a geartype that has higher confidence level + + :param x: STRING, geartype attached with a confidence to compare + :param y: STRING, geartype attached with a confidence to compare + :param shiptype_dict: DICT, a geartype dictionary containing 'path' + info in the hierarchy + :return: STRING, geartype attached with a higher confidence between x and y + """ + + if (x == x) & (x is not None) & (y == y) & (y is not None): + x_level = int(x.split("-")[0]) + x_value = x.split("-")[1] + y_level = int(y.split("-")[0]) + y_value = y.split("-")[1] + # + # if x confidence level is higher, return x if x_level > y_level: return x - ## if confidence levels are the same, determine shiptype and return + # + # if confidence levels are the same, determine shiptype and return elif x_level == y_level: - return str(x_level) + '-' + 
determine_shiptype([x_value, y_value], shiptype_dict) - ## if y confidence level is higher, return y + return ( + str(x_level) + + "-" + + determine_shiptype([x_value, y_value], shiptype_dict) + ) + # + # if y confidence level is higher, return y else: return y - elif (x==x)&(x!=None): + elif (x == x) & (x is not None): return x - elif (y==y)&(y!=None): + elif (y == y) & (y is not None): return y else: return np.nan -### function that makes geartype dictionary from shiptypes yaml file def make_shiptype_dict(shiptypes): - ''' - This module returns a categorical dictionary of ship types from a ship type yml file received. - Values of the dictionary show where a specific ship type is situated in the ship type category tree. - - -------- - ARGUMENT - -------- - shiptypes: DICT, usually loaded from a .yml file that place categorically all possible ship types as a tree - -------- - - ------ - RETURN - ------ - shiptype_dict: DICT, shiptype categorical dictionary - (examples: (key, value) -> (set_longlines, (fishing, fixed_gear, set_longlines))) - ------ - ''' - - ### create a geartype dictionary where each gear has categorical information + """ + This module returns a categorical dictionary of ship types + from a ship type yml file received. Values of the dictionary show + where a specific ship type is situated in the ship type category tree. + + :param shiptypes: DICT, usually loaded from a .yml file that place + categorically all possible ship types as a tree + :return shiptype_dict: DICT, shiptype categorical dictionary + (examples: + (key, value) -> (set_longlines, (fishing, fixed_gear, set_longlines))) + """ + + # + # create a geartype dictionary where each gear has categorical information shiptype_dict = {} for stype in shiptypes: for l1 in shiptypes[stype]: @@ -233,143 +312,169 @@ def make_shiptype_dict(shiptypes): shiptype_dict[l3] = [stype, l1, l2, l3] if shiptypes[stype][l1][l2][l3] is not None: for l4 in shiptypes[stype][l1][l2][l3]: - shiptype_dict[l4] = [stype, l1, l2, l3, l4] - - ### other_fishing, other_not_fishing, unknown_fishing can be replaced by other more specific gears - shiptype_dict['fishing'] = ['fishing'] - shiptype_dict['non_fishing'] = ['non_fishing'] - shiptype_dict['unknown'] = None - shiptype_dict[''] = None - + shiptype_dict[l4] = [ + stype, + l1, + l2, + l3, + l4, + ] + + # + # other_fishing, other_not_fishing, unknown_fishing + # can be replaced by other more specific gears + shiptype_dict["fishing"] = ["fishing"] + shiptype_dict["non_fishing"] = ["non_fishing"] + shiptype_dict["unknown"] = None + shiptype_dict[""] = None + return shiptype_dict -### function to choose only specific gear values if broader level values exist with specific level values def reduce_to_specifics(gears, shiptype_dict): - ''' - this module reduces the list of gear values only to contain specific gear values if there are broader gear values together - - -------- - ARGUMENT - -------- - gears: LIST of strings that are gear types predefined - -------- - - ------ - RETURN - ------ - values: LIST of string that are gear types predefined - - ''' - if len(gears)==0: + """ + This module reduces the list of gear values only to contain specific + gear values if there are broader gear values together + + :param gears: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype dictionary containing 'path' + information in the hierarchy + :return: LIST of string that are gear types predefined + """ + if len(gears) == 0: return [] - - ### reduce only single gear 
values - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - - ### mapped to shiptype dictionary values - gears_mapped = [shiptype_dict[gear] for gear in singles if shiptype_dict[gear]!=None] - + + # + # reduce only single gear values + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + # + # mapped to shiptype dictionary values + gears_mapped = [ + shiptype_dict[gear] + for gear in singles + if shiptype_dict[gear] is not None + ] + temp = list(gears_mapped) for gear in gears_mapped: - others = [g for g in gears_mapped if g!=gear] + others = [g for g in gears_mapped if g != gear] for other in others: - ### see if the gear in question is a subset of anyone of the others, if true, remove it from the list + # + # see if the gear in question is a subset of anyone of the others, + # if true, remove it from the list if set(gear).issubset(other): if gear in temp: temp.remove(gear) gears_mapped = temp - - ### return only end values as in a list + + # + # return only end values as in a list reduced = [] for gear in gears_mapped: val = gear[-1] reduced.append(val) reduced = list(set(reduced)) final = reduced + multiples - - return final + return final def reduce_to_specifics_with_multiples(gears, shiptype_dict): - if len(gears)==0: + """ + Same as the function above but accepting multiple gears attached with '|' + """ + if len(gears) == 0: return [] - - ### reduce singles to specifics if possible + + # + # reduce singles to specifics if possible gears = reduce_to_specifics(gears, shiptype_dict) - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - - if len(multiples)>0: + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + if len(multiples) > 0: for multiple in multiples: - flags=[] - elems = multiple.split('|') - + flags = [] + elems = multiple.split("|") + for elem in elems: - ### look at elements of multiples if they can be reduced to specifics with single values - vals = [reduce_to_specifics([elem, single], shiptype_dict) for single in singles \ - if len(reduce_to_specifics([elem, single], shiptype_dict))==1] - if len(vals)==1: + # + # look at elements of multiples + # if they can be reduced to specifics + # with single values + vals = [ + reduce_to_specifics([elem, single], shiptype_dict) + for single in singles + if len(reduce_to_specifics([elem, single], shiptype_dict)) + == 1 + ] + if len(vals) == 1: flags.append(1) reduced = vals[0] else: flags.append(0) - ### if it can be reduced, then remove this multiple and put this reduced values - if sum(flags)==1: + # + # if it can be reduced, then remove this multiple + # and put this reduced values + if sum(flags) == 1: gears.remove(multiple) gears = gears + reduced - - ### final clearing-up + + # + # final clearing-up gears = reduce_to_specifics(gears, shiptype_dict) - + return gears def reduce_to_general(gears, shiptype_dict): - ''' - this module reduces the list of gear values only to contain general geartype values - - -------- - ARGUMENT - -------- - gears: LIST of strings that are gear types predefined - -------- - - ------ - RETURN - ------ - values: LIST of string that are gear types predefined + """ + This module reduces the list of gear values only to contain general + geartype values - ''' + :param gears: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype 
dictionary containing 'path' + information in the hierarchy + :return: LIST of string that are gear types predefined + """ - if len(gears)==0: + if len(gears) == 0: return [] - - ### reduce only single gear values - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - ### mapped to shiptype dictionary values - gears_mapped = [shiptype_dict[gear] for gear in singles if shiptype_dict[gear]!=None] + # + # reduce only single gear values + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + # + # mapped to shiptype dictionary values + gears_mapped = [ + shiptype_dict[gear] + for gear in singles + if shiptype_dict[gear] is not None + ] temp = list(gears_mapped) for gear in gears_mapped: - others = [g for g in gears_mapped if g!=gear] + others = [g for g in gears_mapped if g != gear] for other in others: - ### see if anyone of the others is a subset of gear in question, if true, remove the gear (more detailed one) from the list + # + # see if anyone of the others is a subset of gear in question, + # if true, remove the gear (more detailed one) from the list if set(other).issubset(gear): if gear in temp: temp.remove(gear) - + gears_mapped = temp - - ### return only end values as in a list + + # + # return only end values as in a list reduced = [] for gear in gears_mapped: val = gear[-1] @@ -382,65 +487,94 @@ def reduce_to_general(gears, shiptype_dict): def reduce_to_general_with_multiples(gears, shiptype_dict): - ''' - returns general (less detailed) gear types only if gear values can be reduced according to shiptype yaml file - ''' - - if len(gears)==0: + """ + Returns general (less detailed) gear types + only if gear values can be reduced according to shiptype yaml file + + :param gears: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype dictionary containing 'path' + information in the hierarchy + :return: LIST of string that are gear types predefined + """ + + if len(gears) == 0: return [] - - ### reduce singles to specifics if possible + + # + # reduce singles to specifics if possible gears = reduce_to_general(gears, shiptype_dict) - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - - if len(multiples)>0: + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + if len(multiples) > 0: for multiple in multiples: - flags=[] - elems = multiple.split('|') - + flags = [] + elems = multiple.split("|") + for elem in elems: - ### look at elements of multiples if they can be reduced to specifics with single values - vals = [reduce_to_general([elem, single], shiptype_dict) for single in singles \ - if len(reduce_to_general([elem, single], shiptype_dict))==1] - if len(vals)==1: + # + # look at elements of multiples if they can be reduced + # to specifics with single values + vals = [ + reduce_to_general([elem, single], shiptype_dict) + for single in singles + if len(reduce_to_general([elem, single], shiptype_dict)) + == 1 + ] + if len(vals) == 1: flags.append(1) reduced = vals[0] else: flags.append(0) - - ### if it can be reduced, then remove this multiple and put this reduced values - if sum(flags)>0: + + # + # if it can be reduced, then remove this multiple + # and put this reduced values + if sum(flags) > 0: gears.remove(multiple) gears = gears + reduced - ### final clearing-up + # + # final clearing-up gears = 
reduce_to_general(gears, shiptype_dict) - - return gears + return gears def is_fishing_vessel(gear, shiptype_dict): - if (gear=='')|(gear==None)|(gear!=gear): + """ + A function that determines if the given vessel class is a fishing vessel + + :param gear: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype dictionary containing 'path' + information in the hierarchy + :return: BOOL, whether the vessel is a fishing vessel + """ + if (gear == "") | (gear is None) | (gear != gear): return None else: - gear = gear.replace(' ','') - gear_mapped=[] - gears = gear.split('|') - - ## create a list of gears mapped to 0s (non-fishing gear) or 1s (fishing gear) + gear = gear.replace(" ", "") + gear_mapped = [] + gears = gear.split("|") + + # + # create a list of gears mapped to + # 0s (non-fishing gear) or 1s (fishing gear) for gear in gears: - if shiptype_dict[gear][0]=='fishing': + if shiptype_dict[gear][0] == "fishing": gear_mapped.append(1) else: gear_mapped.append(0) - if np.prod(gear_mapped)==1: ## if all mapped gears are 1s (therefore fishing vessel) + if ( + np.prod(gear_mapped) == 1 + ): # if all mapped gears are 1s (therefore fishing vessel) isfishingvessel = True - elif sum(gear_mapped)==0: ## if all mapped gears are 0s (therefore non-fishing vessel) + elif ( + sum(gear_mapped) == 0 + ): # if all mapped gears are 0s (therefore non-fishing vessel) isfishingvessel = False - else: ## not determinable, return None + else: # not determinable, return None return None - + return isfishingvessel diff --git a/shipdataprocess/standardize.py b/shipdataprocess/standardize.py index 67108c4..e2a41dc 100644 --- a/shipdataprocess/standardize.py +++ b/shipdataprocess/standardize.py @@ -5,10 +5,27 @@ import pandas as pd import numpy as np import re -from django.utils.encoding import smart_str from unidecode import unidecode +def smart_str(s): + """ + This module finds the right encoding of the given string + + :param s: STRING, a text in which we do not know the type of encoding + :return: STRING, standardized string + """ + if issubclass(type(s), str): + return s + if isinstance(s, bytes): + try: + str(s, "utf-8", "strict") + except UnicodeDecodeError: + str(s, "iso-8859-1", "strict") + else: + return str(s) + + def imo_checksum(n): """ This function for IMO numbers that are designed as 7-digit integer number @@ -32,12 +49,14 @@ def imo_checksum(n): # # IMO checksum formula - if ((n // 1000000 % 10) * 7 + - (n // 100000 % 10) * 6 + - (n // 10000 % 10) * 5 + - (n // 1000 % 10) * 4 + - (n // 100 % 10) * 3 + - (n // 10 % 10) * 2) % 10 == (n % 10): + if ( + (n // 1000000 % 10) * 7 + + (n // 100000 % 10) * 6 + + (n // 10000 % 10) * 5 + + (n // 1000 % 10) * 4 + + (n // 100 % 10) * 3 + + (n // 10 % 10) * 2 + ) % 10 == (n % 10): return True else: return False @@ -47,7 +66,7 @@ def standardize_imo(elem, check_field=True): """ Standardize IMO numbers (ignore all letters and characters but numbers) If it comes with pandas Series or DataFrame, make sure - it saves IMO numbers in STRING, as pandas Seires or DataFrame usually + it saves IMO numbers in STRING, as pandas Series or DataFrame usually turn INTEGER to FLOAT in the presence of NULL in the same column. 
:param elem: Pandas Series, Series that contains a string field @@ -58,69 +77,87 @@ def standardize_imo(elem, check_field=True): if check_field: if type(elem) == pd.core.series.Series: elem = elem.apply( - lambda x: re.sub(r'[^\d\.]', '', str(x)) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + lambda x: re.sub(r"[^\d.]", "", str(x)) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply( lambda x: str(int(float(x))) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply(lambda x: x if imo_checksum(x) else None) return elem elif type(elem) == pd.core.frame.DataFrame: elem = elem[check_field].apply( - lambda x: re.sub(r'[^\d\.]', '', str(x)) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + lambda x: re.sub(r"[^\d.]", "", str(x)) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply( lambda x: str(int(float(x))) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply(lambda x: x if imo_checksum(x) else None) return elem - elif (elem != elem) | (elem is None) | (elem == '') | (elem == 0): + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return None elif (type(elem) == str) | (type(elem) == int) | (type(elem) == float): - elem = re.sub(r'[^\d\.]', '', str(elem)) + elem = re.sub(r"[^\d.]", "", str(elem)) if elem == "": return None else: elem = str(int(float(elem))) - if checksum(elem): + if imo_checksum(elem): return elem else: return None else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize floating numbers. -# Make sure to remove all comma separators (,). -# def standardize_float(elem, check_field=True): + """ + This module standardizes floating numbers. + Make sure to remove all comma separators (,). 
+ + :param elem: Pandas Series, DataFrame, STR, FLOAT, INT, types + that contain a string field + :param check_field: Boolean, field that contains a float number + :return: Same type as the elem input + """ if check_field: if type(elem) == pd.core.series.Series: return elem.apply( - lambda x: float(str(x).replace(',', '')) - if (x == x) & (x is not None) & (x != '') & (x != 0) else np.nan) + lambda x: float(str(x).replace(",", "")) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else np.nan + ) elif type(elem) == pd.core.frame.DataFrame: return elem[check_field].apply( - lambda x: float(str(x).replace(',', '')) - if (x == x) & (x is not None) & (x != '') & (x != 0) else np.nan) - elif (elem != elem) | (elem is None) | (elem == '') | (elem == 0): + lambda x: float(str(x).replace(",", "")) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else np.nan + ) + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return np.nan elif (type(elem) == str) | (type(elem) == int) | (type(elem) == float): - return float(str(elem).replace(',', '')) + return float(str(elem).replace(",", "")) else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return np.nan def smart_upper(text): """ - Selective upper sensitive to upper/lower cases - when it's related to URLs - Source: https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string + Selective upper sensitive to upper/lower cases, particularly + when it's related to URLs, do not turn the URL to upper cases + Source: "https://stackoverflow.com/questions/6038061/ + regular-expression-to-find-urls-within-a-string" :param text: String, giv en text :return: String, Upper cased text except the URL part @@ -129,7 +166,10 @@ def smart_upper(text): # # Find URLs in the given string and upper-case only the other texts # to preserve caps of URLs - regex_for_url = r"((http|ftp|https)\:\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?" + regex_for_url = ( + r"((http|ftp|https)\:\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))" + r"([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?" 
+ ) matched = re.finditer(regex_for_url, text) prev_end = 0 for m in matched: @@ -137,11 +177,9 @@ def smart_upper(text): start = m.start() end = m.end() - text = \ - text[:prev_end] + \ - text[prev_end:start].upper() + \ - url + \ - text[end:] + text = ( + text[:prev_end] + text[prev_end:start].upper() + url + text[end:] + ) prev_end = end text = text[:prev_end] + text[prev_end:].upper() @@ -163,158 +201,310 @@ def standardize_str(elem, check_field=True): if check_field: if type(elem) == pd.core.series.Series: elem = elem.apply( - lambda x: smart_upper(re.sub(r'\s+', ' ', smart_str(x)).strip()) - if (x == x) & (x is not None) & (x != '') else None) + lambda x: smart_upper( + re.sub(r"\s+", " ", smart_str(x)).strip() + ) + if (x == x) & (x is not None) & (x != "") + else None + ) return elem elif type(elem) == pd.core.frame.DataFrame: elem = elem[check_field].apply( - lambda x: smart_upper(re.sub(r'\s+', ' ', smart_str(x)).strip()) - if (x == x) & (x is not None) & (x != '') else None) + lambda x: smart_upper( + re.sub(r"\s+", " ", smart_str(x)).strip() + ) + if (x == x) & (x is not None) & (x != "") + else None + ) return elem - elif (elem != elem) | (elem is None) | (elem == '') | (elem == 0): + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return None elif type(elem) == str: - return smart_upper(re.sub(r'\s+', ' ', elem).strip()) + return smart_upper(re.sub(r"\s+", " ", elem).strip()) else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize owner's names. Remove all variations of CO. LTD or similar types of suffixes -# and unionize all "fishery' to "fisheries". -# def standardize_owner(elem, check_field=True): + """ + This module standardizes owner's names which removes all variations of + suffix such as CO. LTD or similar types + and unionize "fishery' to "fisheries". 
+ + :param elem: Pandas Series, DataFrame, STRING, a data type + that contains a string field + :param check_field: Boolean, field that contains the given strings + :return: + """ if check_field: elem = standardize_str(elem, check_field) - text_to_remove = \ - ['CO LTD', 'COLTD', 'COMPANY LTD', 'CO LIMITED', 'COMPANY LIMITED', 'CO LIMTED', 'CO LTTD', 'CV LIMITADA', - 'LTD SA($)', 'LTD S A($)', 'CO SA($)', 'CO S A($)', 'CO AB($)', 'CO A B($)', 'CO PTY LTD($)', 'CO LRD($)', - 'PTY LIMITED($)', 'PTY LTD($)', 'SA PTY LTD($)', 'CORP LTD($)', 'LTDA EPP($)', 'JOINT STOCK COMPANY($)', - 'JOINTSTOCK COMPANY($)', 'CORPORATION PTE LTD($)', 'CORPORATION PTE($)', 'CORP PTE($)', 'CORP SA($)', - 'CORP INC($)', 'CORPORATION($)', 'CORP($)', 'INCORPORATED($)', 'INC($)', 'AP PTE LTD', 'CO PTE LTD', - 'GMBH CO', 'GMBH($)', 'LTD($)', 'LTDA($)', 'LIMITED($)', 'PTE($)', 'LIMITADA($)', 'LDA($)', 'LLC($)', - 'COMPANY NV($)', 'COMPANY N V($)', 'COMPANY BV($)', 'COMPANY B V($)', 'CO BV($)', 'CO B V($)', 'CO NV($)', - 'CO N V($)', 'SA DE CV($)', 'S A DE C V($)', 'SCL DE CV($)', 'S C L DE C V($)', 'SCL($)', 'S C L($)', - 'S C DE R L($)', 'S R L DE C V($)', 'SAC($)', 'S A C($)', 'EIRL($)', 'E I R L($)', 'SRL($)', 'S R L($)', - ' CIA($)', 'EURL($)', '(^)EURL', 'SARL($)', '(^)SARL', 'SNC($)', '(^)SNC', 'SPC($)', '(^)SPC', 'SPA($)', - 'SAS($)', ' SA($)', ' S A($)', ' SL($)', ' S L($)', ' SC($)', ' S C($)', 'CO WLL($)', 'CO LIB($)', - ' AS($)', ' A S($)', 'PJSC($)', 'P JSC($)', 'OJSC($)', 'CJSC($)' 'JSC($)', ' EPP($)', ' CB($)', ' C B($)', - ' CA($)', ' C A($)', ' GIE($)', 'KABUSHIKI KAISHA($)', ' KK($)', 'K K($)', ' BV($)', ' B V($)', - 'YUGEN KAISHA', 'YUGEN', 'KAISHA', 'KAISYA', 'YUGEN KAISYA', 'GYOGYO', 'GYOGYOU', 'GAISHA', ' JU($)', - 'OOO($)', '(^)OOO', 'CO PVT($)', 'COMPANY PVT($)', ' PT($)', ' P T($)', '(^)PT', ' CC($)', - ' CO($)', 'COMPANY($)', ' NV($)', ' N V($)', '^NA($)', '^N A($)', 'RPTD SOLD.*', 'OWNER UNKNOWN*', - 'CO LT', 'EHF($)', '(^)EHF'] - text_to_remove = '|'.join(text_to_remove) + text_to_remove = [ + "CO LTD", + "COLTD", + "COMPANY LTD", + "CO LIMITED", + "COMPANY LIMITED", + "CO LIMTED", + "CO LTTD", + "CV LIMITADA", + "LTD SA($)", + "LTD S A($)", + "CO SA($)", + "CO S A($)", + "CO AB($)", + "CO A B($)", + "CO PTY LTD($)", + "CO LRD($)", + "PTY LIMITED($)", + "PTY LTD($)", + "SA PTY LTD($)", + "CORP LTD($)", + "LTDA EPP($)", + "JOINT STOCK COMPANY($)", + "JOINTSTOCK COMPANY($)", + "CORPORATION PTE LTD($)", + "CORPORATION PTE($)", + "CORP PTE($)", + "CORP SA($)", + "CORP INC($)", + "CORPORATION($)", + "CORP($)", + "INCORPORATED($)", + "INC($)", + "AP PTE LTD", + "CO PTE LTD", + "GMBH CO", + "GMBH($)", + "LTD($)", + "LTDA($)", + "LIMITED($)", + "PTE($)", + "LIMITADA($)", + "LDA($)", + "LLC($)", + "COMPANY NV($)", + "COMPANY N V($)", + "COMPANY BV($)", + "COMPANY B V($)", + "CO BV($)", + "CO B V($)", + "CO NV($)", + "CO N V($)", + "SA DE CV($)", + "S A DE C V($)", + "SCL DE CV($)", + "S C L DE C V($)", + "SCL($)", + "S C L($)", + "S C DE R L($)", + "S R L DE C V($)", + "SAC($)", + "S A C($)", + "EIRL($)", + "E I R L($)", + "SRL($)", + "S R L($)", + " CIA($)", + "EURL($)", + "(^)EURL", + "SARL($)", + "(^)SARL", + "SNC($)", + "(^)SNC", + "SPC($)", + "(^)SPC", + "SPA($)", + "SAS($)", + " SA($)", + " S A($)", + " SL($)", + " S L($)", + " SC($)", + " S C($)", + "CO WLL($)", + "CO LIB($)", + " AS($)", + " A S($)", + "PJSC($)", + "P JSC($)", + "OJSC($)", + "CJSC($)" "JSC($)", + " EPP($)", + " CB($)", + " C B($)", + " CA($)", + " C A($)", + " GIE($)", + "KABUSHIKI KAISHA($)", + " KK($)", + "K K($)", + " 
BV($)", + " B V($)", + "YUGEN KAISHA", + "YUGEN", + "KAISHA", + "KAISYA", + "YUGEN KAISYA", + "GYOGYO", + "GYOGYOU", + "GAISHA", + " JU($)", + "OOO($)", + "(^)OOO", + "CO PVT($)", + "COMPANY PVT($)", + " PT($)", + " P T($)", + "(^)PT", + " CC($)", + " CO($)", + "COMPANY($)", + " NV($)", + " N V($)", + "^NA($)", + "^N A($)", + "RPTD SOLD.*", + "OWNER UNKNOWN*", + "CO LT", + "EHF($)", + "(^)EHF", + ] + text_to_remove = "|".join(text_to_remove) if type(elem) == pd.core.series.Series: elem = elem.apply( - lambda x: unidecode(re.sub(r'\(.+\)', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"\(.+\)", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem.apply( - lambda x: unidecode(re.sub(r'[^\w]+', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"[^\w]+", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem.apply( - lambda x: re.sub(text_to_remove, ' ', x) if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(text_to_remove, " ", x) + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem.apply( - lambda x: re.sub(r'\s+', ' ', x).strip() if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(r"\s+", " ", x).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) return elem.apply( - lambda x: re.sub('FISHERY', 'FISHERIES', x) if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub("FISHERY", "FISHERIES", x) + if (x == x) & (x is not None) & (x != "") + else None + ) elif type(elem) == pd.core.frame.DataFrame: elem = elem[check_field].apply( - lambda x: unidecode(re.sub(r'\(.+\)', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"\(.+\)", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem[check_field].apply( - lambda x: unidecode(re.sub(r'[^\w]+', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"[^\w]+", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem[check_field].apply( - lambda x: re.sub(text_to_remove, ' ', x) if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(text_to_remove, " ", x) + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem[check_field].apply( - lambda x: re.sub(r'\s+', ' ', x).strip() if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(r"\s+", " ", x).strip() + if (x == x) & (x is not None) * (x != "") + else None + ) return elem[check_field].apply( - lambda x: re.sub('FISHERY', 'FISHERIES', x) if (x == x) & (x != None) * (x != '') else None) - elif (elem != elem) | (elem == None) | (elem == '') | (elem == 0): + lambda x: re.sub("FISHERY", "FISHERIES", x) + if (x == x) & (x is not None) * (x != "") + else None + ) + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return np.nan elif type(elem) == str: - elem = unidecode(re.sub(r'\(.+\)', ' ', elem)).strip() - elem = unidecode(re.sub(r'[^\w]+', ' ', elem)).strip() - elem = re.sub(text_to_remove, ' ', elem) - elem = re.sub(r'\s+', ' ', elem).strip() - return re.sub('FISHERY', 'FISHERIES', elem) + elem = unidecode(re.sub(r"\(.+\)", " ", elem)).strip() + elem = unidecode(re.sub(r"[^\w]+", " ", elem)).strip() + elem = re.sub(text_to_remove, " ", elem) + elem = re.sub(r"\s+", " ", elem).strip() + return re.sub("FISHERY", "FISHERIES", elem) else: - raise 
ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize Integer in a form of string -# because Pandas Series or DataFrame considers -# a column of integers with Nulls as a column of float -# Save it as a string column so that it can be uploaded -# as integer columns when uploading to BigQuery. -# def standardize_int_str(elem, check_field=True): + """ + This module standardizes an integer in the form of string + because Pandas Series or DataFrame considers a column of integers + with Nulls as a column of float. Save it as a string column so that + it can be uploaded as integer columns when uploading to BigQuery. + + :param elem: Pandas Series, DataFrame, STRING, INT, FLOAT, a data type + that contains a string field + :param check_field: Boolean, field that contains the given strings + :return: Same as the input elem type + """ if check_field: if type(elem) == pd.core.series.Series: return elem.apply( - lambda x: str(int(float(re.sub('[^\d\.]', '', str(x))))) - if (x == x) & (x is not None) & (x != '') else None) + lambda x: str(int(float(re.sub(r"[^\d.]", "", str(x))))) + if (x == x) & (x is not None) & (x != "") + else None + ) elif type(elem) == pd.core.frame.DataFrame: return elem[check_field].apply( - lambda x: str(int(float(re.sub('[^\d\.]', '', str(x))))) - if (x == x) & (x is not None) & (x != '') else None) - elif (elem != elem) | (elem is None) | (elem == ''): + lambda x: str(int(float(re.sub(r"[^\d.]", "", str(x))))) + if (x == x) & (x is not None) & (x != "") + else None + ) + elif (elem != elem) | (elem is None) | (elem == ""): return None elif (type(elem) == str) | (type(elem) == int) | (type(elem) == float): - return str(int(float(re.sub(r'[^\d\.]', '', str(elem))))) + return str(int(float(re.sub(r"[^\d.]", "", str(elem))))) else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize timestamp -# def standardize_time(elem, check_field=True): - if check_field: - if type(elem)==pd.core.series.Series: - return elem.apply(lambda x: pd.to_datetime(x, errors='coerce') if (x==x)&(x!=None)&(x!='') else None) - elif type(elem)==pd.core.frame.DataFrame: - return elem[check_field].apply(lambda x: pd.to_datetime(x, errors='coerce') if (x==x)&(x!=None)&(x!='') else None) - elif (elem!=elem)|(elem==None)|(elem=='')|(elem==0): - return np.nan - elif (type(elem)==str)|(type(elem)==pd.Timestamp): - return pd.to_datetime(elem, errors='coerce') - else: - raise ValueError('Unknown type received') - else: - return None - - -def clean_uvi(x): - if (type(x)==float)|(type(x)==int): - if (not np.isnan(x))&(x==x)&(x!=None): - return str(int(x)) - else: - return np.nan - else: - return re.sub('\s+', ' ', x).strip().upper() + """ + This modules standardizes a timestamp + :param elem: Pandas DATAFRAME, SERIES, STRING, a data type containing + time stamp information + :param check_field: Boolean, whether the field that contains + the timestamp information + :return: Same type as the elem input + """ -def standardize_uvi(elem, check_field=True): if check_field: - if type(elem)==pd.core.series.Series: - return elem.apply(lambda x: clean_uvi(x)) - elif type(elem)==pd.core.frame.DataFrame: - return elem[check_field].apply(lambda x: clean_uvi(x)) - elif (elem!=elem)|(elem==None)|(elem=='')|(elem==0): - return None - elif (type(elem)==int)|(type(elem)==float): - return str(int(elem)) - elif type(elem)==str: - return re.sub('\s+',' ',elem).strip().upper() + if type(elem) == 
pd.core.series.Series: + return elem.apply( + lambda x: pd.to_datetime(x, errors="coerce") + if (x == x) & (x is not None) & (x != "") + else None + ) + elif type(elem) == pd.core.frame.DataFrame: + return elem[check_field].apply( + lambda x: pd.to_datetime(x, errors="coerce") + if (x == x) & (x is not None) & (x != "") + else None + ) + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): + return np.nan + elif (type(elem) == str) | (type(elem) == pd.Timestamp): + return pd.to_datetime(elem, errors="coerce") else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None @@ -323,10 +513,10 @@ def standardize_flag(df, field, rules): """ Flag mapping based on YAML mapping file per registry - :param df: - :param field: - :param rules: - :return: + :param df: Pandas DataFrame, a dataframe containing flag information field + :param field: STRING, the name of the field containing flag information + :param rules: DICT, the YAML mapping rule + :return: Pandas Series or STRING """ if field: if rules: @@ -334,17 +524,19 @@ def standardize_flag(df, field, rules): # In case it's explicitly "ALL" as an option, # returns the preset value if "ALL" in rules: - return rules['ALL'] + return rules["ALL"] # # If it's "SAME" option, use the values in the flag field - elif 'SAME' in rules: + elif "SAME" in rules: return df[field] # # iso3 country code - note that all is turned to upper cases else: return df[field].apply( lambda x: rules[unidecode(str(x)).strip().upper()] - if (x == x) & (x is not None) & (x != '') else None) + if (x == x) & (x is not None) & (x != "") + else None + ) else: return None else: @@ -355,16 +547,17 @@ def standardize_geartype(df, field, rules): """ Geartype mapping based on YAML mapping file per registry - :param df: - :param field: - :param rules: - :return: + :param df: Pandas DataFrame, a DataFrame containing geartype + information field + :param field: STRING, the name of the field containing geartype information + :param rules: DICT, the YAML mapping rule + :return: Pandas Series or STRING """ if field: if rules: - if 'ALL' in rules: - return rules['ALL'] - elif 'SAME' in rules: + if "ALL" in rules: + return rules["ALL"] + elif "SAME" in rules: return df[field] # # note that when mapping geartype, @@ -372,8 +565,40 @@ def standardize_geartype(df, field, rules): else: return df[field].apply( lambda x: rules[unidecode(str(x)).strip().lower()] - if (x == x) & (x is not None) & (x != '') else None) + if (x == x) & (x is not None) & (x != "") + else None + ) else: return None else: return None + + +# +# Below is not used. 
+# def clean_uvi(x): +# if (type(x) == float) | (type(x) == int): +# if (not np.isnan(x)) & (x == x) & (x is not None): +# return str(int(x)) +# else: +# return np.nan +# else: +# return re.sub("\s+", " ", x).strip().upper() +# +# +# def standardize_uvi(elem, check_field=True): +# if check_field: +# if type(elem) == pd.core.series.Series: +# return elem.apply(lambda x: clean_uvi(x)) +# elif type(elem) == pd.core.frame.DataFrame: +# return elem[check_field].apply(lambda x: clean_uvi(x)) +# elif (elem != elem) | (elem == None) | (elem == "") | (elem == 0): +# return None +# elif (type(elem) == int) | (type(elem) == float): +# return str(int(elem)) +# elif type(elem) == str: +# return re.sub("\s+", " ", elem).strip().upper() +# else: +# raise ValueError("Unknown type received") +# else: +# return None diff --git a/tests/test_normalize_shipname.py b/tests/test_normalize_shipname.py index e7779f5..497fa15 100644 --- a/tests/test_normalize_shipname.py +++ b/tests/test_normalize_shipname.py @@ -1,49 +1,98 @@ from shipdataprocess.normalize import normalize_shipname + def test_normalize_shipname_none(): result = normalize_shipname(None) - assert result == None + assert result is None + def test_normalize_shipname_upcase(): result = normalize_shipname("MixEd") assert result == "MIXED" + +def test_normalize_shipname_num(): + result = normalize_shipname(123456) + assert result == "123456" + + +def test_normalize_shipname_float(): + result = normalize_shipname(123.456) + assert result is None + + def test_normalize_shipname_symbols(): result = normalize_shipname("weird -+%()<>$;!&'`\\.#/") assert result == "WEIRD" + def test_normalize_shipname_spaces(): result = normalize_shipname(" \tspaced \nname ") assert result == "SPACEDNAME" + def test_normalize_shipname_FB(): result = normalize_shipname("f/b boat f/v othername") assert result == "BOATOTHERNAME" + def test_normalize_shipname_RV(): result = normalize_shipname("r/v boat othername") assert result == "BOATOTHERNAME" + def test_normalize_shipname_nodot(): result = normalize_shipname("no. 
boat") assert result == "BOAT" + def test_normalize_shipname_nonumber(): result = normalize_shipname("no537 boat") - assert result == 'BOAT537' + assert result == "BOAT537" + def test_normalize_shipname_romans(): result = normalize_shipname("boat IX") assert result == "BOAT9" + def test_normalize_shipname_empty(): result = normalize_shipname("") - assert result == None + assert result is None + + +def test_normalize_shipname_empty_space(): + result = normalize_shipname(" ") + assert result is None + def test_normalize_shipname_1c(): result = normalize_shipname("a") assert result == "A" + def test_normalize_shipname_no(): result = normalize_shipname("no") assert result == "NO" + + +# +# Below are added in Jan 2022 for encoding tests +def test_normalize_shipname_utf8(): + result = normalize_shipname("ÆØÅæøå") + assert result == "AEOAAEOA" + + +def test_normalize_shipname_utf8_b(): + result = normalize_shipname("ÇÊÎŞÛ") + assert result == "CEISU" + + +def test_normalize_shipname_utf8_encoded(): + result = normalize_shipname(b"pyth\xc3\xb6n!") + assert result == "PYTHON" + + +def test_normalize_shipname_latin_encoded(): + result = normalize_shipname(b"\xe1") + assert result == "A" From 29a70d0152719fae42c9d7596b90936f0349f295 Mon Sep 17 00:00:00 2001 From: jaeyoonpark Date: Wed, 26 Jan 2022 13:30:35 +0100 Subject: [PATCH 2/4] update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5c63ab8..355a3ff 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ sdist/ var/ wheels/ *.egg-info/ +/*.egg-info/* .installed.cfg *.egg MANIFEST From 3bfc2bbfcc814ba25fba9a681ced179ee5e343cc Mon Sep 17 00:00:00 2001 From: jaeyoonpark Date: Wed, 26 Jan 2022 14:05:25 +0100 Subject: [PATCH 3/4] update 0.7.0 --- CHANGES.md | 3 +- build/lib/shipdataprocess/__init__.py | 10 +- build/lib/shipdataprocess/collapse.py | 192 +++++-- build/lib/shipdataprocess/normalize.py | 325 ++++++----- build/lib/shipdataprocess/shiptype.py | 658 ++++++++++++++--------- build/lib/shipdataprocess/standardize.py | 529 ++++++++++++------ shipdataprocess/__init__.py | 2 +- 7 files changed, 1106 insertions(+), 613 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index e6711a1..2129f6f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -38,5 +38,4 @@ v0.6.15, 2020-11-06 -- Fix a bug in normalize_shipname() and normalize_callsign( v0.6.16, 2020-11-26 -- Make smart_upper() to capture multiple URLs not to capitalize them v0.6.17, 2021-07-30 -- Add Indonesian prefix and Chinese HAO v0.6.18, 2021-08-04 -- Fix a bug in normalize_callsign() regarding NULL/NONE - - +v0.7.0, 2022-01-26 -- Fix it to work only in Python 3.6 or above, codes are compliant with PEP8, dependencies are clearer (Django removed) diff --git a/build/lib/shipdataprocess/__init__.py b/build/lib/shipdataprocess/__init__.py index 7b322e5..444767d 100644 --- a/build/lib/shipdataprocess/__init__.py +++ b/build/lib/shipdataprocess/__init__.py @@ -3,15 +3,15 @@ """ -__version__ = '0.6.18' -__author__ = 'Jaeyoon Park' -__email__ = 'jaeyoon.park13@gmail.com' -__source__ = 'https://github.com/GlobalFishingWatch/shipdataprocess' +__version__ = "0.7.0" +__author__ = "Jaeyoon Park" +__email__ = "jaeyoon@globalfishingwatch.org" +__source__ = "https://github.com/GlobalFishingWatch/shipdataprocess" __license__ = """ Copyright 2017 Global Fishing Watch Inc. Authors: -Jaeyoon Park +Jaeyoon Park Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
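For reference, a minimal usage sketch of the normalization behavior exercised by the tests above; it is illustrative only (not part of the patch itself) and assumes the updated package is importable as shipdataprocess, simply mirroring the expected outputs from those test cases.
# Illustrative sketch only; mirrors the test cases added above.
from shipdataprocess.normalize import normalize_shipname

assert normalize_shipname("MixEd") == "MIXED"  # upper-casing
assert normalize_shipname("boat IX") == "BOAT9"  # roman numeral converted to digit
assert normalize_shipname("ÆØÅæøå") == "AEOAAEOA"  # accented characters transliterated
assert normalize_shipname(b"pyth\xc3\xb6n!") == "PYTHON"  # utf-8 encoded bytes input
assert normalize_shipname(b"\xe1") == "A"  # iso-8859-1 fallback for non-utf-8 bytes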
diff --git a/build/lib/shipdataprocess/collapse.py b/build/lib/shipdataprocess/collapse.py index b18a0ce..823a580 100644 --- a/build/lib/shipdataprocess/collapse.py +++ b/build/lib/shipdataprocess/collapse.py @@ -1,32 +1,51 @@ +""" +This file contains functions that help collapse (multiple) rows +for each vessel found in the process of producing Global Fishing Watch's +internal vessel database. + +Last updated: 2022-01-24 +Jaeyoon Park +""" + import pandas as pd import numpy as np import re from collections import Counter -### helper functions for collapsing rows by vessel - def non_zero_mean(x): try: - x = x[(x!=0)&(x!=None)] - if len(x)==0: return 0.0 - else: return x.mean() - except: + x = x[(x != 0) & (x is not None)] + if len(x) == 0: + return 0.0 + else: + return x.mean() + except AttributeError: return 0.0 - + + def non_zero_std(x): try: - x = x[(x!=0)&(x!=None)] - if len(x)<2: return 0.0 - else: return x.std() - except: + x = x[(x != 0) & (x is not None)] + if len(x) < 2: + return 0.0 + else: + return x.std() + except AttributeError: return 0.0 - -def most_common_value(x): ## remove if standard deviation is too big compared to mean value of all numbers - '''remove if standard deviation is too big compared to mean value of all numbers''' - if (type(x)==list)&(len(x)>0): + + +def most_common_value(x): + """ + Remove if standard deviation is too big compared to mean value of + all numbers. The standard deviation threshold is set to be 10%. + + x: Pandas Series or list, a list of numerical values + (for length, tonnage, engine power) + """ + if (type(x) == list) & (len(x) > 0): x = pd.Series(x) - if (type(x)==pd.core.series.Series)&(len(x.dropna())>0): + if (type(x) == pd.core.series.Series) & (len(x.dropna()) > 0): x_mean = non_zero_mean(x) x_std = non_zero_std(x) if x_std > x_mean * 0.1: @@ -36,95 +55,156 @@ def most_common_value(x): ## remove if standard deviation is too big compared to else: return np.nan + def most_common_value_with_confidence(cx): - '''same functionality as most_common_value() but with confidence level taken account''' - if (type(cx)==pd.core.series.Series)&(len(cx)>0): - if len(cx.dropna())==0: + """ + same functionality as most_common_value() but with confidence level + taken into account + + cx: Pandas Series or list, a list of numerical values + (for length, tonnage, engine power) + with a confidence level indicator attached with '-' in front of the value. + """ + if (type(cx) == pd.core.series.Series) & (len(cx) > 0): + if len(cx.dropna()) == 0: return np.nan else: cx = list(cx.values) - if (type(cx)==list)&(len(cx)>0): - clist = [int(elem.split('-')[0]) for elem in cx if (elem==elem)&(elem!=None)] - xlist = [elem for elem in cx if (elem==elem)&(elem!=None)] - if len(clist)>0: + if (type(cx) == list) & (len(cx) > 0): + clist = [ + int(elem.split("-")[0]) + for elem in cx + if (elem == elem) & (elem is not None) + ] + xlist = [elem for elem in cx if (elem == elem) & (elem is not None)] + if len(clist) > 0: max_c = max(clist) - x = [float(elem.split('-')[1]) for elem in xlist if int(elem.split('-')[0])==max_c] + x = [ + float(elem.split("-")[1]) + for elem in xlist + if int(elem.split("-")[0]) == max_c + ] + # Call the function to return the most common value return most_common_value(x) else: return np.nan else: return np.nan - -def most_common_num(x): ## mostly for imo collapsing + + +def most_common_num(x): + """ + Return the most common number (mostly for imo collapsing). 
+ + x: Pandas Series, a list of numbers + """ try: x = x.dropna() - if len(x)==0: + if len(x) == 0: return np.nan else: vals = x.values - vs = [v for v in vals if (v!=0)] - #vs = list(set(vs)) - if len(vs)==0: + vs = [v for v in vals if (v != 0)] + # vs = list(set(vs)) + if len(vs) == 0: return np.nan - else: + else: data = Counter(vs) return max(vs, key=data.get) - except: + except AttributeError: return np.nan - + + def most_common_str(x): + """ + Return the most common string. + + x: Pandas Series, a list of values in string + """ try: x = x.dropna() - if len(x)==0: + if len(x) == 0: return np.nan else: - vals = x.values - vs = [re.sub('\s+',' ',str(v)).strip().upper() for v in x.values] - vs = [v for v in vs if v!=''] - #vs = list(set(vs)) - if len(vs)==0: + vs = [ + re.sub(r"\s+", " ", str(v)).strip().upper() for v in x.values + ] + vs = [v for v in vs if v != ""] + # vs = list(set(vs)) + if len(vs) == 0: return np.nan else: data = Counter(vs) return max(vs, key=data.get) - #if len(vs)==1: - # return vs[0] - #else: - # return None - except: + except AttributeError: return np.nan - -def str_attached(x): ## join all strings + + +def str_attached(x): + """ + Return all strings joined. If the values are in numbers, convert them + to string and combined. + + :param x: Pandas Series or list + :return: A joined string + """ try: x = x.dropna() - if len(x)==0: + if len(x) == 0: return np.nan else: - x = x.apply(lambda v: str(int(v)) if (type(v)==float)|(type(v)==int)|(type(v)==long) else v) + x = x.apply( + lambda v: str(int(v)) + if (type(v) == float) | (type(v) == int) + else v + ) vals = x.values.tolist() - #vs = [str(v).strip() for v in vals if (v==v)&(v!=None)&(v!='')] - #vs = [v for v in vs if (v!='')] + # vs = [str(v).strip() for v in vals if (v==v)&(v!=None)&(v!='')] + # vs = [v for v in vs if (v!='')] vs = list(set(vals)) - return ', '.join(sorted(vs)) - except: + return ", ".join(sorted(vs)) + except AttributeError: return np.nan - + + def min_time(x): + """ + Return the minimum time + + :param x: Pandas Series + :return: Timestamp + """ vals = x.values - vs = [v for v in vals if (v==v)&(v!=None)&(v!='')] + vs = [v for v in vals if (v == v) & (v is not None) & (v != "")] vs = pd.Series(vs) + return vs.min() + def max_time(x): + """ + Return the maximum time + + :param x: Pandas Series + :return: Timestamp + """ vals = x.values - vs = [v for v in vals if (v==v)&(v!=None)&(v!='')] + vs = [v for v in vals if (v == v) & (v is not None) & (v != "")] vs = pd.Series(vs) + return vs.max() + def highest_confidence(x): + """ + Return the maximum confidence if none return 1 (the lowest). + + :param x: Pandas Series or list + :return: Integer + """ x = x.dropna() - if len(x)>0: + if len(x) > 0: return max(x.tolist()) else: return 1 diff --git a/build/lib/shipdataprocess/normalize.py b/build/lib/shipdataprocess/normalize.py index 29104a9..c95a2cd 100644 --- a/build/lib/shipdataprocess/normalize.py +++ b/build/lib/shipdataprocess/normalize.py @@ -1,205 +1,260 @@ +""" +This file provides functions that normalize ship name and call sign of a vessel +either recorded in registries or in vessel tracking data. The normalization, or +standardization of string, will ensure that the strings are comparable to other +strings despite various ways of recording names of the same vessel. +It also removes all non-essential characters or white spaces. 
+""" from unidecode import unidecode import roman import re -import sys def normalize_shipname(name): - - if (name is None)|(name != name)|(name == ''): - return None + """ + Return a normalized ship name by removing all non-essential characters, + prefix, and suffix, and standardizing roman numerals or other parts + of the vessel name. - # - # Turn to upper cases - name = name.upper() - + :param name: String, an original vessel name + :return: String, a normalized vessel name + """ + if (name is None) | (name != name) | (name == ""): + return None + print(name) # # Remove nasty characters and white spaces - if sys.version_info[0] < 3: + # try: + # name = unidecode(str(name.decode("utf-8"))) + # except UnicodeDecodeError: + # name = unidecode(str(name.decode("iso_8859-1"))) + + if issubclass(type(name), str): + name = unidecode(name) + elif isinstance(name, bytes): try: - name = unidecode(str(name)) # get rid of nasty characters, but sometimes this fails - except: - try: - name = unidecode(str(name.decode('utf8'))) - except: - name = unidecode(str(name.decode('iso_8859-1'))) + name = unidecode(str(name, "utf-8", "strict")) + except UnicodeDecodeError: + name = unidecode(str(name, "iso-8859-1", "strict")) + elif isinstance(name, int): + name = str(name) else: - name = unidecode(str(name)) + return None + print(name) + # + # Turn to upper cases + name = name.upper() - name = re.sub('\s+',' ',name) + name = re.sub(r"\s+", " ", name) name = name.strip() - name = name.replace('\n','').replace('\r','') - + name = name.replace("\n", "").replace("\r", "") + # # Remove fishing vessel code - name = re.sub('MFV[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub('MPV[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub('HMS[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub('LPG[/|C]*[\W]*|LNG[/|C]*[\W]*', ' ', name) # LPG/LNG variations - - name = re.sub('(\s|^)F[^\w\s]*V[^\w]*', ' ', name) # fishing vessel code in English (F/V, F-V, F.V, FV: etc) - name = re.sub('^F[^\w\s]*B[^\w]+', ' ', name) # fishing vessel code in English - name = re.sub(' F[^\w\s]*B[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*P[^\w]+', ' ', name) # fishing vessel code in Italy/Spain - name = re.sub(' M[^\w\s]*P[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*B[^\w]+', ' ', name) # fishing vessel code in Italy/Spain - name = re.sub(' M[^\w\s]*B[^\w]*(\s|$)', ' ', name) - name = re.sub('^G[^\w\s]*V[^\w]+', ' ', name) # mostly in UK - name = re.sub('S+F+[^\w]+G[^\w\s]*V[^\w]*', ' ', name) - name = re.sub(' G[^\w\s]*V[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*V[^\w]+', ' ', name) # in English - name = re.sub(' M[^\w\s]*V[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]+S[^\w]+', ' ', name) # Merchant Ship - name = re.sub(' M[^\w\s]+S[^\w]*(\s|$)', ' ', name) - name = re.sub('^M[^\w\s]*K[^\w]+', ' ', name) # mostly in northern europe - name = re.sub(' M[^\w\s]+K[^\w]*(\s|$)', ' ', name) - name = re.sub('^R[^\w\s]*V[^\w]+', ' ', name) # Research Vessel - name = re.sub(' R[^\w\s]*V[^\w]*(\s|$)', ' ', name) - - name = re.sub('^T[^\w\s]*T[^\w]+', ' ', name) # Tender To - name = re.sub(' T[^\w\s]*T[^\w]*($)', ' ', name) - name = re.sub('^S[^\w\s]*Y[^\w]+', ' ', name) # Steam Yacht - name = re.sub(' S[^\w\s]*Y[^\w]*($)', ' ', name) - name = re.sub('^M[^\w\s]*F[^\w]+', ' ', name) # Motor Ferry - name = re.sub(' M[^\w\s]*F[^\w]*($)', ' ', name) - name = re.sub('^S[^\w\s]*S[^\w]+', ' ', name) # Steam Ship - name = re.sub(' S[^\w\s]*S[^\w]*($)', ' ', name) - name = 
re.sub('^S[^\w\s]*V[^\w]+', ' ', name) # Sailing Vessel - name = re.sub(' S[^\w\s]*V[^\w]*($)', ' ', name) - name = re.sub('^M[^\w\s]*T[^\w]+', ' ', name) # Motor Tanker - name = re.sub(' M[^\w\s]*T[^\w]*($)', ' ', name) - name = re.sub('^M[^\w\s]+Y[^\w]+', ' ', name) # Motor Yacht - name = re.sub(' M[^\w\s]+Y[^\w]*($)', ' ', name) - name = re.sub('^[A-Z]/[A-Z][^\w]+', ' ', name) # All other types of X/X - name = re.sub(' [A-Z]/[A-Z]($)', ' ', name) - name = re.sub('^[A-Z]\\\\[A-Z][^\w]+', ' ', name) ## All other types of X\X - name = re.sub(' [A-Z]\\\\[A-Z]($)', ' ', name) - name = re.sub('^KM[^\w]+', ' ', name) # Indonesia K.M - name = re.sub('^E.B. ', ' ', name) # Dutch E.B. equivalent to NO. - - name = re.sub('\(.+\)', ' ', name) # All additional information in parentheses - name = re.sub('\[.+\]', ' ', name) - + name = re.sub(r"MFV[^\w]+", " ", name) # fishing vessel code in English + name = re.sub(r"MPV[^\w]+", " ", name) # fishing vessel code in English + name = re.sub(r"HMS[^\w]+", " ", name) # fishing vessel code in English + name = re.sub( + r"LPG[/|C]*[\W]*|LNG[/|C]*[\W]*", " ", name + ) # LPG/LNG variations + + name = re.sub( + r"(\s|^)F[^\w\s]*V[^\w]*", " ", name + ) # fishing vessel code in English (F/V, F-V, F.V, FV: etc) + name = re.sub( + r"^F[^\w\s]*B[^\w]+", " ", name + ) # fishing vessel code in English + name = re.sub(r" F[^\w\s]*B[^\w]*(\s|$)", " ", name) + name = re.sub( + r"^M[^\w\s]*P[^\w]+", " ", name + ) # fishing vessel code in Italy/Spain + name = re.sub(r" M[^\w\s]*P[^\w]*(\s|$)", " ", name) + name = re.sub( + r"^M[^\w\s]*B[^\w]+", " ", name + ) # fishing vessel code in Italy/Spain + name = re.sub(r" M[^\w\s]*B[^\w]*(\s|$)", " ", name) + name = re.sub(r"^G[^\w\s]*V[^\w]+", " ", name) # mostly in UK + name = re.sub(r"S+F+[^\w]+G[^\w\s]*V[^\w]*", " ", name) + name = re.sub(r" G[^\w\s]*V[^\w]*(\s|$)", " ", name) + name = re.sub(r"^M[^\w\s]*V[^\w]+", " ", name) # in English + name = re.sub(r" M[^\w\s]*V[^\w]*(\s|$)", " ", name) + name = re.sub(r"^M[^\w\s]+S[^\w]+", " ", name) # Merchant Ship + name = re.sub(r" M[^\w\s]+S[^\w]*(\s|$)", " ", name) + name = re.sub(r"^M[^\w\s]*K[^\w]+", " ", name) # mostly in northern europe + name = re.sub(r" M[^\w\s]+K[^\w]*(\s|$)", " ", name) + name = re.sub(r"^R[^\w\s]*V[^\w]+", " ", name) # Research Vessel + name = re.sub(r" R[^\w\s]*V[^\w]*(\s|$)", " ", name) + + name = re.sub(r"^T[^\w\s]*T[^\w]+", " ", name) # Tender To + name = re.sub(r" T[^\w\s]*T[^\w]*($)", " ", name) + name = re.sub(r"^S[^\w\s]*Y[^\w]+", " ", name) # Steam Yacht + name = re.sub(r" S[^\w\s]*Y[^\w]*($)", " ", name) + name = re.sub(r"^M[^\w\s]*F[^\w]+", " ", name) # Motor Ferry + name = re.sub(r" M[^\w\s]*F[^\w]*($)", " ", name) + name = re.sub(r"^S[^\w\s]*S[^\w]+", " ", name) # Steam Ship + name = re.sub(r" S[^\w\s]*S[^\w]*($)", " ", name) + name = re.sub(r"^S[^\w\s]*V[^\w]+", " ", name) # Sailing Vessel + name = re.sub(r" S[^\w\s]*V[^\w]*($)", " ", name) + name = re.sub(r"^M[^\w\s]*T[^\w]+", " ", name) # Motor Tanker + name = re.sub(r" M[^\w\s]*T[^\w]*($)", " ", name) + name = re.sub(r"^M[^\w\s]+Y[^\w]+", " ", name) # Motor Yacht + name = re.sub(r" M[^\w\s]+Y[^\w]*($)", " ", name) + name = re.sub(r"^[A-Z]/[A-Z][^\w]+", " ", name) # All other types of X/X + name = re.sub(r" [A-Z]/[A-Z]($)", " ", name) + name = re.sub( + r"^[A-Z]\\\\[A-Z][^\w]+", " ", name + ) # All other types of X\X + name = re.sub(r" [A-Z]\\\\[A-Z]($)", " ", name) + name = re.sub(r"^KM[^\w]+", " ", name) # Indonesia K.M + name = re.sub(r"^E.B. ", " ", name) # Dutch E.B. equivalent to NO. 
+ + name = re.sub( + r"\(.+\)", " ", name + ) # All additional information in parentheses + name = re.sub(r"\[.+\]", " ", name) + # # Numbers in letters - name = re.sub(' ONE($)| UNO($)| UN($)', ' 1', name) - name = re.sub(' TWO($)| DOS($)| DEUX($)', ' 2', name) - name = re.sub(' THREE($)| TRES($)| TROIS($)', ' 3', name) - name = re.sub(' FOUR($)| CUATRO($)| QUATRE($)', ' 4', name) - name = re.sub(' FIVE($)| CINCO($)| CINQ($)', ' 5', name) - name = re.sub(' SIX($)| SEIS($)| SIX($)', ' 6', name) - name = re.sub(' SEVEN($)| SIETE($)| SEPT($)', ' 7', name) - name = re.sub(' EIGHT($)| OCHO($)| HUIT($)', ' 8', name) - name = re.sub(' NINE($)| NUEVE($)| NEUF($)', ' 9', name) - name = re.sub(' TEN($)| DIEZ($)| DIX($)', ' 10', name) - name = re.sub(' ELEVEN($)| ONCE($)| ONZE($)', ' 11', name) - name = re.sub(' TWELVE($)| DOCE($)| DOUZE($)', ' 12', name) - name = re.sub(' THIRTEEN($)| TRECE($)| TREIZE($)', ' 13', name) - name = re.sub(' FOURTEEN($)| CATORCE($)| QUATORZE($)', ' 14', name) - name = re.sub(' FIFTEEN($)| QUINCE($)| QUINZE($)', ' 15', name) - - name = re.sub('1ST ', 'FIRST ', name) - name = re.sub('2ND ', 'SECOND ', name) - name = re.sub('3RD ', 'THIRD ', name) - name = re.sub('4TH ', 'FOURTH ', name) - name = re.sub('5TH ', 'FIFTH ', name) + name = re.sub(r" ONE($)| UNO($)| UN($)", " 1", name) + name = re.sub(r" TWO($)| DOS($)| DEUX($)", " 2", name) + name = re.sub(r" THREE($)| TRES($)| TROIS($)", " 3", name) + name = re.sub(r" FOUR($)| CUATRO($)| QUATRE($)", " 4", name) + name = re.sub(r" FIVE($)| CINCO($)| CINQ($)", " 5", name) + name = re.sub(r" SIX($)| SEIS($)", " 6", name) + name = re.sub(r" SEVEN($)| SIETE($)| SEPT($)", " 7", name) + name = re.sub(r" EIGHT($)| OCHO($)| HUIT($)", " 8", name) + name = re.sub(r" NINE($)| NUEVE($)| NEUF($)", " 9", name) + name = re.sub(r" TEN($)| DIEZ($)| DIX($)", " 10", name) + name = re.sub(r" ELEVEN($)| ONCE($)| ONZE($)", " 11", name) + name = re.sub(r" TWELVE($)| DOCE($)| DOUZE($)", " 12", name) + name = re.sub(r" THIRTEEN($)| TRECE($)| TREIZE($)", " 13", name) + name = re.sub(r" FOURTEEN($)| CATORCE($)| QUATORZE($)", " 14", name) + name = re.sub(r" FIFTEEN($)| QUINCE($)| QUINZE($)", " 15", name) + + name = re.sub("1ST ", "FIRST ", name) + name = re.sub("2ND ", "SECOND ", name) + name = re.sub("3RD ", "THIRD ", name) + name = re.sub("4TH ", "FOURTH ", name) + name = re.sub("5TH ", "FIFTH ", name) # # Country specific appendix (S. Korea and China) - name = re.sub('\d+\s*HO($)', ' ', name) - name = re.sub('\d+\s*HAO($)', ' ', name) + name = re.sub(r"\d+\s*HO($)", " ", name) + name = re.sub(r"\d+\s*HAO($)", " ", name) # # Remove NO.s such in NO.5, NO5, NO:5, NO. 
5, NO 5, N5, N-5 etc - name = re.sub('NO[^\w\s]*[\s]*(?=\d+)', '', name) - name = re.sub('[\s]+N[\W_0]*(?=\d+)', '', name) - name = re.sub('NO\.\s*(?=[^0-9]+)', '', name) - + name = re.sub(r"NO[^\w\s]*[\s]*(?=\d+)", "", name) + name = re.sub(r"[\s]+N[\W_0]*(?=\d+)", "", name) + name = re.sub(r"NO\.\s*(?=[^0-9]+)", "", name) + # # Turn '&' to 'AND' - name = re.sub('(?<=[A-Z])\s+&\s+(?=[A-Z])', ' AND ', name) # replace 'BLACK & WHITE' to 'BLACK AND WHITE' - + name = re.sub( + r"(?<=[A-Z])\s+&\s+(?=[A-Z])", " AND ", name + ) # replace 'BLACK & WHITE' to 'BLACK AND WHITE' + # # Deromanization - vs = re.split('\s+|-|(?<=[A-Z]{3})\.',name) + vs = re.split(r"\s+|-|(?<=[A-Z]{3})\.", name) try: # # If last word from the name text has L/C/D/M then do not deromanize - if re.search('[LCDM]', vs[-1]).group(0): pass - except: + if re.search(r"[LCDM]", vs[-1]).group(0): + pass + except AttributeError: # # Try to deromanize the last word from the name text try: vs[-1] = roman.fromRoman(vs[-1]) vs[-1] = str(int(vs[-1])) - except: + except roman.InvalidRomanNumeralError: + # + # No corresponding roman numeral found. Let's leave it as is. pass - + # # Attach the deromanized digits to the end - name = ''.join(vs) + name = "".join(vs) - # # Now, remove all special characters - name = re.sub('[\W_]', '', name) - + name = re.sub(r"[\W_]", "", name) + # # Check if the name starts with digits, if yes move it to the end - try: - first_digit = re.search('^\d+', name).group(0) - name = re.sub('^\d+', '', name) + str(first_digit) - except: - pass + obj = re.search(r"^\d+", name) + if obj: + first_digit = obj.group(0) + name = re.sub(r"^\d+", "", name) + str(first_digit) # # Remove 0s from the numbers starting with 0s - try: - last_digit = re.search('\d+$', name).group(0) - non_zeros = re.sub('^0+', '', last_digit) - name = re.sub('\d+$', '', name) + str(non_zeros) - except: - pass + obj = re.search(r"\d+$", name) + if obj: + last_digit = obj.group(0) + non_zeros = re.sub("^0+", "", last_digit) + name = re.sub(r"\d+$", "", name) + str(non_zeros) + + # + # Remove all excessive white spaces + name = re.sub(r"\s+", " ", name) - if name == '': + if name == "" or name == " ": return None - - return name + else: + return name def normalize_callsign(callsign): + """ + Return a normalized International Radio Call Sign by removing non-essential + characters and ignoring meaningless call sign including 'NONE', 'UNKNOWN' + + :param callsign: String, an original call sign + :return: String, a normalized call sign + """ - if (callsign is None) | (callsign != callsign) | (callsign == '') | \ - (callsign == "NONE") | (callsign == "UNKNOWN") | (callsign == "NIL") | (callsign == "NULL"): + if ( + (callsign is None) + | (callsign != callsign) + | (callsign == "") + | (callsign == "NONE") + | (callsign == "UNKNOWN") + | (callsign == "NIL") + | (callsign == "NULL") + ): return None # # Turn to upper cases callsign = callsign.upper() - + # # Remove nasty characters, white space try: - callsign = unidecode(str(callsign)) # get rid of nasty characters, but sometimes this fails - except: + # + # get rid of nasty characters, but sometimes this fails + callsign = unidecode(str(callsign)) + except UnicodeDecodeError: try: - callsign = unidecode(str(callsign.decode('utf8'))) - except: - callsign = unidecode(str(callsign.decode('iso_8859-1'))) + callsign = unidecode(str(callsign.decode("utf8"))) + except UnicodeDecodeError: + callsign = unidecode(str(callsign.decode("iso_8859-1"))) callsign = callsign.strip() - callsign = re.sub('\s+',' 
',callsign) + callsign = re.sub(r"\s+", " ", callsign) # # Get rid of all non-word characters - callsign = re.sub('[\W_]', '', callsign) - + callsign = re.sub(r"[\W_]", "", callsign) + # # Remove 0s from callsign starting with 0s - callsign = re.sub('^0+', '', callsign) - - if callsign == '': - return None + callsign = re.sub(r"^0+", "", callsign) - return callsign \ No newline at end of file + if callsign == "": + return None + else: + return callsign diff --git a/build/lib/shipdataprocess/shiptype.py b/build/lib/shipdataprocess/shiptype.py index 70eff72..8ee5773 100644 --- a/build/lib/shipdataprocess/shiptype.py +++ b/build/lib/shipdataprocess/shiptype.py @@ -1,226 +1,305 @@ -import pandas as pd +""" +This file provides functions that process operations with regard to vessel +types defined by Global Fishing Watch (There are about 40 ship types +pre-defined). See here +https://globalfishingwatch.org/datasets-and-code-vessel-identity/ + +Last updates: 2022-01-25 +Jaeyoon Park +""" import numpy as np - def determine_shiptype(gears, shiptype_dict): - ''' - determinte_shiptype module receives multiple types of ship and returns the most specific ship type. - - -------- - ARGUMENT - -------- - gears: SERIES, LIST, OR STR, single or multiple combination of ship types joined by '|' (OR) - (examples: fixed_gear|set_longlines, cargo) - -------- - - ------ - RETURN - ------ - STR or None, select the most detailed type among the ship types received if they are all in one category, + """ + This module receives multiple types of ship and returns the most specific + ship type in the pre-defined vessel classification hierarchy. + https://globalfishingwatch.org/datasets-and-code-vessel-identity/ + + :param gears: SERIES, LIST, OR STR, single or multiple combination of ship + type joined by '|' (OR) (examples: fixed_gear|set_longlines, cargo) + :param shiptype_dict: DICT, a geartype dictionary containing 'path' + information in the vessel class hierarchy + :return: STR or None, select the most detailed type among the ship types + received if they are all in one category, otherwise a combination of ship types. 
- (examples: fixed_gear|set_longlines -> set_longlines, trawler|fixed_gear|set_longlines -> trawler|set_longlines) - ------ - ''' - + (examples: fixed_gear|set_longlines -> set_longlines, + trawler|fixed_gear|set_longlines -> trawler|set_longlines) + """ - ## if there is no information on gears, then return None - if len(gears)==0: + # + # if there is no information on gears, then return None + if len(gears) == 0: return None - - ### make sure the entry is a list of strings - if type(gears)==str: + + # + # make sure the entry is a list of strings + if type(gears) == str: gears = [gears] - elif type(gears)==list: + elif type(gears) == list: pass - else: gears = gears.tolist() - - ### remove Nones - gears = [gear.replace(' ','').strip() for gear in gears if (gear!=None)&(gear==gear)&(gear!='')] - - ### take only specific ones if there are several possibly duplicated ones (example: trawlers, trawlers|purse_seines) + else: + gears = gears.tolist() + + # + # remove Nones + gears = [ + gear.replace(" ", "").strip() + for gear in gears + if (gear is not None) & (gear == gear) & (gear != "") + ] + + # + # take only specific ones if there are several possibly duplicated ones + # (example: trawlers, trawlers|purse_seines) gears = reduce_to_specifics_with_multiples(gears, shiptype_dict) - ### get rid of '|' and take all possible gears individually - gears_split=[] + # + # get rid of '|' and take all possible gears individually + gears_split = [] for g in gears: - if '|' in g: - gears_split += g.split('|') + if "|" in g: + gears_split += g.split("|") else: gears_split.append(g) - - ### map geartype_dict to compare categories (broader ones to be removed) + + # + # map geartype_dict to compare categories + # (broader/ ones to be removed) gears = reduce_to_specifics(gears_split, shiptype_dict) - ### remove redundant values and join together with '|' + # + # remove redundant values and join together with '|' gears = sorted(list(set(gears))) - final_value = '|'.join(gears) - if final_value=='': + final_value = "|".join(gears) + if final_value == "": return None else: return final_value - def determine_shiptype_simple(gears, shiptype_dict): - ''' - same as determinte_shiptype module but without reducing multiple gears to specific (this is for testing). - ''' + """ + same as determine_shiptype module but without reducing multiple gears + to specific (this is for testing). + + :param gears: SERIES, LIST, OR STR, single or multiple combination of + ship types joined by '|' (OR) (examples: fixed_gear|set_longlines, cargo) + :param shiptype_dict: DICT, ship type dictionary containing 'path' of + gear type in the hierarchy + :return: STR or None, select the most detailed type among the ship types + received if they are all in one category, + otherwise a combination of ship types. 
+ (examples: fixed_gear|set_longlines -> set_longlines, + trawler|fixed_gear|set_longlines -> trawler|set_longlines) + """ - ## if there is no information on gears, then return None - if len(gears)==0: + # + # if there is no information on gears, then return None + if len(gears) == 0: return None - - ### make sure the entry is a list of strings - if type(gears)==str: + + # + # make sure the entry is a list of strings + if type(gears) == str: gears = [gears] - elif type(gears)==list: + elif type(gears) == list: pass - else: gears = gears.tolist() - - ### remove Nones - gears = [gear.replace(' ','').strip() for gear in gears if (gear!=None)&(gear==gear)&(gear!='')] - - ### get rid of '|' and take all possible gears individually - gears_split=[] + else: + gears = gears.tolist() + + # + # remove Nones + gears = [ + gear.replace(" ", "").strip() + for gear in gears + if (gear is not None) & (gear == gear) & (gear != "") + ] + + # + # get rid of '|' and take all possible gears individually + gears_split = [] for g in gears: - if '|' in g: - gears_split += g.split('|') + if "|" in g: + gears_split += g.split("|") else: gears_split.append(g) - - ### map geartype_dict to compare categories (broader ones to be removed) + + # + # map geartype_dict to compare categories (broader ones to be removed) gears = reduce_to_specifics(gears_split, shiptype_dict) - ### remove redundant values and join together with '|' + # + # remove redundant values and join together with '|' gears = sorted(list(set(gears))) - final_value = '|'.join(gears) - if final_value=='': + final_value = "|".join(gears) + if final_value == "": return None else: return final_value def tag_confidence_level(x, c): - if (x==x)&(x!=None)&(x!=0)&(x!=''): - return str(c) + '-' + str(x) + """ + Helper function to add confidence level to geartype + + :param x: STRING, geartype + :param c: INT, confidence level (1 to 4) + :return: STRING, geartype attached with confidence level by a dash ('-') + """ + if (x == x) & (x is not None) & (x != 0) & (x != ""): + return str(c) + "-" + str(x) else: return np.nan def determine_shiptype_with_confidence(gears, shiptype_dict): - ''' - same as determine_shiptype but with confidence level taken into account - ''' - - ## if there is no information on gears, then return None - if len(gears)==0: + """ + same as the determine_shiptype module above + but with confidence level taken into account + """ + + # + # if there is no information on gears, then return None + if len(gears) == 0: return np.nan - - ### make sure the entry is a list of strings - if type(gears)==str: + + # + # make sure the entry is a list of strings + if type(gears) == str: gears = [gears] - elif type(gears)==list: + elif type(gears) == list: pass - else: gears = gears.tolist() - - ### remove NaN/None - gears = [gear.replace(' ','').strip() for gear in gears if (gear!=None)&(gear==gear)&(gear!='')] - if len(gears)==0: + else: + gears = gears.tolist() + + # + # remove NaN/None + gears = [ + gear.replace(" ", "").strip() + for gear in gears + if (gear is not None) & (gear == gear) & (gear != "") + ] + if len(gears) == 0: return np.nan - - ### remove all gear values from lists of less confidence level - levels = [int(gear.split('-')[0]) for gear in gears] - if len(levels)>0: + + # + # remove all gear values from lists of less confidence level + levels = [int(gear.split("-")[0]) for gear in gears] + if len(levels) > 0: highest_level = max(levels) - if (highest_level==3)&(2 in levels): - gears_3 = [gear.split('-')[1] for gear in gears if ('3' in 
gear)] - gears_2 = [gear.split('-')[1] for gear in gears if ('2' in gear)] - gears = [gear.split('-')[1] for gear in gears if ('2' in gear)|('3' in gear)] + if (highest_level == 3) & (2 in levels): + gears_3 = [gear.split("-")[1] for gear in gears if ("3" in gear)] + gears_2 = [gear.split("-")[1] for gear in gears if ("2" in gear)] + gears = [ + gear.split("-")[1] + for gear in gears + if ("2" in gear) | ("3" in gear) + ] else: - gears = [gear.split('-')[1] for gear in gears if str(highest_level) in gear] - - ### take only specific ones if there are several possibly duplicated ones (example: trawlers, trawlers|purse_seines) + gears = [ + gear.split("-")[1] + for gear in gears + if str(highest_level) in gear + ] + + # + # take only specific ones if there are several possibly duplicated ones + # (example: trawlers, trawlers|purse_seines) gears = reduce_to_specifics_with_multiples(gears, shiptype_dict) - ### get rid of '|' and take all possible gears individually - gears_split=[] + # + # get rid of '|' and take all possible gears individually + gears_split = [] for g in gears: - if '|' in g: - gears_split += g.split('|') + if "|" in g: + gears_split += g.split("|") else: gears_split.append(g) - - ### map geartype_dict to compare categories (broader ones to be removed) + + # + # map geartype_dict to compare categories (broader ones to be removed) gears = reduce_to_specifics(gears_split, shiptype_dict) - ### remove redundant values and join together with '|' + # + # remove redundant values and join together with '|' gears = sorted(list(set(gears))) - final_value = '|'.join(gears) - - ### check the case of combination of level 2 and 3 - if (highest_level==3)&(2 in levels): + final_value = "|".join(gears) + + # + # check the case of combination of level 2 and 3 + if (highest_level == 3) & (2 in levels): final_value_3 = determine_shiptype(gears_3, shiptype_dict) final_value_2 = determine_shiptype(gears_2, shiptype_dict) - if (not final_value in final_value_3)&(final_value in final_value_2): + if (final_value not in final_value_3) & (final_value in final_value_2): pass - else: + else: final_value = final_value_3 - - ### output - if final_value=='': + + # + # output + if final_value == "": return np.nan else: - final_value = str(highest_level) + '-' + final_value + final_value = str(highest_level) + "-" + final_value return final_value def select_high_confidence_geartype(x, y, shiptype_dict): - '''return a geartype that has higher confidence level''' - - if (x==x)&(x!=None)&(y==y)&(y!=None): - x_level = int(x.split('-')[0]) - x_value = x.split('-')[1] - y_level = int(y.split('-')[0]) - y_value = y.split('-')[1] - ## if x confidence level is higher, return x + """ + Return a geartype that has higher confidence level + + :param x: STRING, geartype attached with a confidence to compare + :param y: STRING, geartype attached with a confidence to compare + :param shiptype_dict: DICT, a geartype dictionary containing 'path' + info in the hierarchy + :return: STRING, geartype attached with a higher confidence between x and y + """ + + if (x == x) & (x is not None) & (y == y) & (y is not None): + x_level = int(x.split("-")[0]) + x_value = x.split("-")[1] + y_level = int(y.split("-")[0]) + y_value = y.split("-")[1] + # + # if x confidence level is higher, return x if x_level > y_level: return x - ## if confidence levels are the same, determine shiptype and return + # + # if confidence levels are the same, determine shiptype and return elif x_level == y_level: - return str(x_level) + '-' + 
determine_shiptype([x_value, y_value], shiptype_dict) - ## if y confidence level is higher, return y + return ( + str(x_level) + + "-" + + determine_shiptype([x_value, y_value], shiptype_dict) + ) + # + # if y confidence level is higher, return y else: return y - elif (x==x)&(x!=None): + elif (x == x) & (x is not None): return x - elif (y==y)&(y!=None): + elif (y == y) & (y is not None): return y else: return np.nan -### function that makes geartype dictionary from shiptypes yaml file def make_shiptype_dict(shiptypes): - ''' - This module returns a categorical dictionary of ship types from a ship type yml file received. - Values of the dictionary show where a specific ship type is situated in the ship type category tree. - - -------- - ARGUMENT - -------- - shiptypes: DICT, usually loaded from a .yml file that place categorically all possible ship types as a tree - -------- - - ------ - RETURN - ------ - shiptype_dict: DICT, shiptype categorical dictionary - (examples: (key, value) -> (set_longlines, (fishing, fixed_gear, set_longlines))) - ------ - ''' - - ### create a geartype dictionary where each gear has categorical information + """ + This module returns a categorical dictionary of ship types + from a ship type yml file received. Values of the dictionary show + where a specific ship type is situated in the ship type category tree. + + :param shiptypes: DICT, usually loaded from a .yml file that place + categorically all possible ship types as a tree + :return shiptype_dict: DICT, shiptype categorical dictionary + (examples: + (key, value) -> (set_longlines, (fishing, fixed_gear, set_longlines))) + """ + + # + # create a geartype dictionary where each gear has categorical information shiptype_dict = {} for stype in shiptypes: for l1 in shiptypes[stype]: @@ -233,143 +312,169 @@ def make_shiptype_dict(shiptypes): shiptype_dict[l3] = [stype, l1, l2, l3] if shiptypes[stype][l1][l2][l3] is not None: for l4 in shiptypes[stype][l1][l2][l3]: - shiptype_dict[l4] = [stype, l1, l2, l3, l4] - - ### other_fishing, other_not_fishing, unknown_fishing can be replaced by other more specific gears - shiptype_dict['fishing'] = ['fishing'] - shiptype_dict['non_fishing'] = ['non_fishing'] - shiptype_dict['unknown'] = None - shiptype_dict[''] = None - + shiptype_dict[l4] = [ + stype, + l1, + l2, + l3, + l4, + ] + + # + # other_fishing, other_not_fishing, unknown_fishing + # can be replaced by other more specific gears + shiptype_dict["fishing"] = ["fishing"] + shiptype_dict["non_fishing"] = ["non_fishing"] + shiptype_dict["unknown"] = None + shiptype_dict[""] = None + return shiptype_dict -### function to choose only specific gear values if broader level values exist with specific level values def reduce_to_specifics(gears, shiptype_dict): - ''' - this module reduces the list of gear values only to contain specific gear values if there are broader gear values together - - -------- - ARGUMENT - -------- - gears: LIST of strings that are gear types predefined - -------- - - ------ - RETURN - ------ - values: LIST of string that are gear types predefined - - ''' - if len(gears)==0: + """ + This module reduces the list of gear values only to contain specific + gear values if there are broader gear values together + + :param gears: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype dictionary containing 'path' + information in the hierarchy + :return: LIST of string that are gear types predefined + """ + if len(gears) == 0: return [] - - ### reduce only single gear 
values - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - - ### mapped to shiptype dictionary values - gears_mapped = [shiptype_dict[gear] for gear in singles if shiptype_dict[gear]!=None] - + + # + # reduce only single gear values + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + # + # mapped to shiptype dictionary values + gears_mapped = [ + shiptype_dict[gear] + for gear in singles + if shiptype_dict[gear] is not None + ] + temp = list(gears_mapped) for gear in gears_mapped: - others = [g for g in gears_mapped if g!=gear] + others = [g for g in gears_mapped if g != gear] for other in others: - ### see if the gear in question is a subset of anyone of the others, if true, remove it from the list + # + # see if the gear in question is a subset of anyone of the others, + # if true, remove it from the list if set(gear).issubset(other): if gear in temp: temp.remove(gear) gears_mapped = temp - - ### return only end values as in a list + + # + # return only end values as in a list reduced = [] for gear in gears_mapped: val = gear[-1] reduced.append(val) reduced = list(set(reduced)) final = reduced + multiples - - return final + return final def reduce_to_specifics_with_multiples(gears, shiptype_dict): - if len(gears)==0: + """ + Same as the function above but accepting multiple gears attached with '|' + """ + if len(gears) == 0: return [] - - ### reduce singles to specifics if possible + + # + # reduce singles to specifics if possible gears = reduce_to_specifics(gears, shiptype_dict) - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - - if len(multiples)>0: + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + if len(multiples) > 0: for multiple in multiples: - flags=[] - elems = multiple.split('|') - + flags = [] + elems = multiple.split("|") + for elem in elems: - ### look at elements of multiples if they can be reduced to specifics with single values - vals = [reduce_to_specifics([elem, single], shiptype_dict) for single in singles \ - if len(reduce_to_specifics([elem, single], shiptype_dict))==1] - if len(vals)==1: + # + # look at elements of multiples + # if they can be reduced to specifics + # with single values + vals = [ + reduce_to_specifics([elem, single], shiptype_dict) + for single in singles + if len(reduce_to_specifics([elem, single], shiptype_dict)) + == 1 + ] + if len(vals) == 1: flags.append(1) reduced = vals[0] else: flags.append(0) - ### if it can be reduced, then remove this multiple and put this reduced values - if sum(flags)==1: + # + # if it can be reduced, then remove this multiple + # and put this reduced values + if sum(flags) == 1: gears.remove(multiple) gears = gears + reduced - - ### final clearing-up + + # + # final clearing-up gears = reduce_to_specifics(gears, shiptype_dict) - + return gears def reduce_to_general(gears, shiptype_dict): - ''' - this module reduces the list of gear values only to contain general geartype values - - -------- - ARGUMENT - -------- - gears: LIST of strings that are gear types predefined - -------- - - ------ - RETURN - ------ - values: LIST of string that are gear types predefined + """ + This module reduces the list of gear values only to contain general + geartype values - ''' + :param gears: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype 
dictionary containing 'path' + information in the hierarchy + :return: LIST of string that are gear types predefined + """ - if len(gears)==0: + if len(gears) == 0: return [] - - ### reduce only single gear values - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - ### mapped to shiptype dictionary values - gears_mapped = [shiptype_dict[gear] for gear in singles if shiptype_dict[gear]!=None] + # + # reduce only single gear values + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + # + # mapped to shiptype dictionary values + gears_mapped = [ + shiptype_dict[gear] + for gear in singles + if shiptype_dict[gear] is not None + ] temp = list(gears_mapped) for gear in gears_mapped: - others = [g for g in gears_mapped if g!=gear] + others = [g for g in gears_mapped if g != gear] for other in others: - ### see if anyone of the others is a subset of gear in question, if true, remove the gear (more detailed one) from the list + # + # see if anyone of the others is a subset of gear in question, + # if true, remove the gear (more detailed one) from the list if set(other).issubset(gear): if gear in temp: temp.remove(gear) - + gears_mapped = temp - - ### return only end values as in a list + + # + # return only end values as in a list reduced = [] for gear in gears_mapped: val = gear[-1] @@ -382,65 +487,94 @@ def reduce_to_general(gears, shiptype_dict): def reduce_to_general_with_multiples(gears, shiptype_dict): - ''' - returns general (less detailed) gear types only if gear values can be reduced according to shiptype yaml file - ''' - - if len(gears)==0: + """ + Returns general (less detailed) gear types + only if gear values can be reduced according to shiptype yaml file + + :param gears: LIST, list of strings that are gear types predefined + :param shiptype_dict: DICT, geartype dictionary containing 'path' + information in the hierarchy + :return: LIST of string that are gear types predefined + """ + + if len(gears) == 0: return [] - - ### reduce singles to specifics if possible + + # + # reduce singles to specifics if possible gears = reduce_to_general(gears, shiptype_dict) - singles = [gear for gear in gears if '|' not in gear] - multiples = [gear for gear in gears if '|' in gear] - - if len(multiples)>0: + singles = [gear for gear in gears if "|" not in gear] + multiples = [gear for gear in gears if "|" in gear] + + if len(multiples) > 0: for multiple in multiples: - flags=[] - elems = multiple.split('|') - + flags = [] + elems = multiple.split("|") + for elem in elems: - ### look at elements of multiples if they can be reduced to specifics with single values - vals = [reduce_to_general([elem, single], shiptype_dict) for single in singles \ - if len(reduce_to_general([elem, single], shiptype_dict))==1] - if len(vals)==1: + # + # look at elements of multiples if they can be reduced + # to specifics with single values + vals = [ + reduce_to_general([elem, single], shiptype_dict) + for single in singles + if len(reduce_to_general([elem, single], shiptype_dict)) + == 1 + ] + if len(vals) == 1: flags.append(1) reduced = vals[0] else: flags.append(0) - - ### if it can be reduced, then remove this multiple and put this reduced values - if sum(flags)>0: + + # + # if it can be reduced, then remove this multiple + # and put this reduced values + if sum(flags) > 0: gears.remove(multiple) gears = gears + reduced - ### final clearing-up + # + # final clearing-up gears = 
reduce_to_general(gears, shiptype_dict)
-
-    return gears
+    return gears
 
 
 def is_fishing_vessel(gear, shiptype_dict):
-    if (gear=='')|(gear==None)|(gear!=gear):
+    """
+    A function that determines if the given vessel class is a fishing vessel
+
+    :param gear: STRING, gear type value(s), multiple values joined with '|'
+    :param shiptype_dict: DICT, geartype dictionary containing 'path'
+    information in the hierarchy
+    :return: BOOL, whether the vessel is a fishing vessel
+    """
+    if (gear == "") | (gear is None) | (gear != gear):
         return None
     else:
-        gear = gear.replace(' ','')
-        gear_mapped=[]
-        gears = gear.split('|')
-
-        ## create a list of gears mapped to 0s (non-fishing gear) or 1s (fishing gear)
+        gear = gear.replace(" ", "")
+        gear_mapped = []
+        gears = gear.split("|")
+
+        #
+        # create a list of gears mapped to
+        # 0s (non-fishing gear) or 1s (fishing gear)
         for gear in gears:
-            if shiptype_dict[gear][0]=='fishing':
+            if shiptype_dict[gear][0] == "fishing":
                 gear_mapped.append(1)
             else:
                 gear_mapped.append(0)
 
-        if np.prod(gear_mapped)==1: ## if all mapped gears are 1s (therefore fishing vessel)
+        if (
+            np.prod(gear_mapped) == 1
+        ):  # if all mapped gears are 1s (therefore fishing vessel)
             isfishingvessel = True
-        elif sum(gear_mapped)==0: ## if all mapped gears are 0s (therefore non-fishing vessel)
+        elif (
+            sum(gear_mapped) == 0
+        ):  # if all mapped gears are 0s (therefore non-fishing vessel)
             isfishingvessel = False
-        else: ## not determinable, return None
+        else:  # not determinable, return None
             return None
-
+
         return isfishingvessel
diff --git a/build/lib/shipdataprocess/standardize.py b/build/lib/shipdataprocess/standardize.py
index 67108c4..e2a41dc 100644
--- a/build/lib/shipdataprocess/standardize.py
+++ b/build/lib/shipdataprocess/standardize.py
@@ -5,10 +5,27 @@
 import pandas as pd
 import numpy as np
 import re
-from django.utils.encoding import smart_str
 from unidecode import unidecode
 
 
+def smart_str(s):
+    """
+    This module finds the right encoding of the given string
+
+    :param s: STRING, a text in which we do not know the type of encoding
+    :return: STRING, standardized string
+    """
+    if issubclass(type(s), str):
+        return s
+    if isinstance(s, bytes):
+        try:
+            return str(s, "utf-8", "strict")
+        except UnicodeDecodeError:
+            return str(s, "iso-8859-1", "strict")
+    else:
+        return str(s)
+
+
 def imo_checksum(n):
     """
     This function for IMO numbers that are designed as 7-digit integer number
@@ -32,12 +49,14 @@ def imo_checksum(n):
 
     #
     # IMO checksum formula
-    if ((n // 1000000 % 10) * 7 +
-        (n // 100000 % 10) * 6 +
-        (n // 10000 % 10) * 5 +
-        (n // 1000 % 10) * 4 +
-        (n // 100 % 10) * 3 +
-        (n // 10 % 10) * 2) % 10 == (n % 10):
+    if (
+        (n // 1000000 % 10) * 7
+        + (n // 100000 % 10) * 6
+        + (n // 10000 % 10) * 5
+        + (n // 1000 % 10) * 4
+        + (n // 100 % 10) * 3
+        + (n // 10 % 10) * 2
+    ) % 10 == (n % 10):
         return True
     else:
         return False
@@ -47,7 +66,7 @@ def imo_checksum(n):
     """
     Standardize IMO numbers (ignore all letters and characters but numbers)
     If it comes with pandas Series or DataFrame, make sure
-    it saves IMO numbers in STRING, as pandas Seires or DataFrame usually
+    it saves IMO numbers in STRING, as pandas Series or DataFrame usually
     turn INTEGER to FLOAT in the presence of NULL in the same column.
:param elem: Pandas Series, Series that contains a string field @@ -58,69 +77,87 @@ def standardize_imo(elem, check_field=True): if check_field: if type(elem) == pd.core.series.Series: elem = elem.apply( - lambda x: re.sub(r'[^\d\.]', '', str(x)) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + lambda x: re.sub(r"[^\d.]", "", str(x)) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply( lambda x: str(int(float(x))) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply(lambda x: x if imo_checksum(x) else None) return elem elif type(elem) == pd.core.frame.DataFrame: elem = elem[check_field].apply( - lambda x: re.sub(r'[^\d\.]', '', str(x)) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + lambda x: re.sub(r"[^\d.]", "", str(x)) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply( lambda x: str(int(float(x))) - if (x == x) & (x is not None) & (x != '') & (x != 0) else None) + if (x == x) & (x is not None) & (x != "") & (x != 0) + else None + ) elem = elem.apply(lambda x: x if imo_checksum(x) else None) return elem - elif (elem != elem) | (elem is None) | (elem == '') | (elem == 0): + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return None elif (type(elem) == str) | (type(elem) == int) | (type(elem) == float): - elem = re.sub(r'[^\d\.]', '', str(elem)) + elem = re.sub(r"[^\d.]", "", str(elem)) if elem == "": return None else: elem = str(int(float(elem))) - if checksum(elem): + if imo_checksum(elem): return elem else: return None else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize floating numbers. -# Make sure to remove all comma separators (,). -# def standardize_float(elem, check_field=True): + """ + This module standardizes floating numbers. + Make sure to remove all comma separators (,). 
+
+    :param elem: Pandas Series, DataFrame, STR, FLOAT, INT, types
+    that contain a float value
+    :param check_field: Boolean, field that contains a float number
+    :return: Same type as the elem input
+    """
     if check_field:
         if type(elem) == pd.core.series.Series:
             return elem.apply(
-                lambda x: float(str(x).replace(',', ''))
-                if (x == x) & (x is not None) & (x != '') & (x != 0) else np.nan)
+                lambda x: float(str(x).replace(",", ""))
+                if (x == x) & (x is not None) & (x != "") & (x != 0)
+                else np.nan
+            )
         elif type(elem) == pd.core.frame.DataFrame:
             return elem[check_field].apply(
-                lambda x: float(str(x).replace(',', ''))
-                if (x == x) & (x is not None) & (x != '') & (x != 0) else np.nan)
-        elif (elem != elem) | (elem is None) | (elem == '') | (elem == 0):
+                lambda x: float(str(x).replace(",", ""))
+                if (x == x) & (x is not None) & (x != "") & (x != 0)
+                else np.nan
+            )
+        elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0):
             return np.nan
         elif (type(elem) == str) | (type(elem) == int) | (type(elem) == float):
-            return float(str(elem).replace(',', ''))
+            return float(str(elem).replace(",", ""))
         else:
-            raise ValueError('Unknown type received')
+            raise ValueError("Unknown type received")
     else:
         return np.nan
 
 
 def smart_upper(text):
     """
-    Selective upper sensitive to upper/lower cases
-    when it's related to URLs
-    Source: https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string
+    Selective upper-casing sensitive to URLs: the given text is turned to
+    upper case, but any URL found in the text keeps its original case.
+    Source: "https://stackoverflow.com/questions/6038061/
+    regular-expression-to-find-urls-within-a-string"
 
     :param text: String, giv en text
     :return: String, Upper cased text except the URL part
@@ -129,7 +166,10 @@ def smart_upper(text):
     #
     # Find URLs in the given string and upper-case only the other texts
     # to preserve caps of URLs
-    regex_for_url = r"((http|ftp|https)\:\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
+    regex_for_url = (
+        r"((http|ftp|https)\:\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))"
+        r"([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
+ ) matched = re.finditer(regex_for_url, text) prev_end = 0 for m in matched: @@ -137,11 +177,9 @@ def smart_upper(text): start = m.start() end = m.end() - text = \ - text[:prev_end] + \ - text[prev_end:start].upper() + \ - url + \ - text[end:] + text = ( + text[:prev_end] + text[prev_end:start].upper() + url + text[end:] + ) prev_end = end text = text[:prev_end] + text[prev_end:].upper() @@ -163,158 +201,310 @@ def standardize_str(elem, check_field=True): if check_field: if type(elem) == pd.core.series.Series: elem = elem.apply( - lambda x: smart_upper(re.sub(r'\s+', ' ', smart_str(x)).strip()) - if (x == x) & (x is not None) & (x != '') else None) + lambda x: smart_upper( + re.sub(r"\s+", " ", smart_str(x)).strip() + ) + if (x == x) & (x is not None) & (x != "") + else None + ) return elem elif type(elem) == pd.core.frame.DataFrame: elem = elem[check_field].apply( - lambda x: smart_upper(re.sub(r'\s+', ' ', smart_str(x)).strip()) - if (x == x) & (x is not None) & (x != '') else None) + lambda x: smart_upper( + re.sub(r"\s+", " ", smart_str(x)).strip() + ) + if (x == x) & (x is not None) & (x != "") + else None + ) return elem - elif (elem != elem) | (elem is None) | (elem == '') | (elem == 0): + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return None elif type(elem) == str: - return smart_upper(re.sub(r'\s+', ' ', elem).strip()) + return smart_upper(re.sub(r"\s+", " ", elem).strip()) else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize owner's names. Remove all variations of CO. LTD or similar types of suffixes -# and unionize all "fishery' to "fisheries". -# def standardize_owner(elem, check_field=True): + """ + This module standardizes owner's names which removes all variations of + suffix such as CO. LTD or similar types + and unionize "fishery' to "fisheries". 
+ + :param elem: Pandas Series, DataFrame, STRING, a data type + that contains a string field + :param check_field: Boolean, field that contains the given strings + :return: + """ if check_field: elem = standardize_str(elem, check_field) - text_to_remove = \ - ['CO LTD', 'COLTD', 'COMPANY LTD', 'CO LIMITED', 'COMPANY LIMITED', 'CO LIMTED', 'CO LTTD', 'CV LIMITADA', - 'LTD SA($)', 'LTD S A($)', 'CO SA($)', 'CO S A($)', 'CO AB($)', 'CO A B($)', 'CO PTY LTD($)', 'CO LRD($)', - 'PTY LIMITED($)', 'PTY LTD($)', 'SA PTY LTD($)', 'CORP LTD($)', 'LTDA EPP($)', 'JOINT STOCK COMPANY($)', - 'JOINTSTOCK COMPANY($)', 'CORPORATION PTE LTD($)', 'CORPORATION PTE($)', 'CORP PTE($)', 'CORP SA($)', - 'CORP INC($)', 'CORPORATION($)', 'CORP($)', 'INCORPORATED($)', 'INC($)', 'AP PTE LTD', 'CO PTE LTD', - 'GMBH CO', 'GMBH($)', 'LTD($)', 'LTDA($)', 'LIMITED($)', 'PTE($)', 'LIMITADA($)', 'LDA($)', 'LLC($)', - 'COMPANY NV($)', 'COMPANY N V($)', 'COMPANY BV($)', 'COMPANY B V($)', 'CO BV($)', 'CO B V($)', 'CO NV($)', - 'CO N V($)', 'SA DE CV($)', 'S A DE C V($)', 'SCL DE CV($)', 'S C L DE C V($)', 'SCL($)', 'S C L($)', - 'S C DE R L($)', 'S R L DE C V($)', 'SAC($)', 'S A C($)', 'EIRL($)', 'E I R L($)', 'SRL($)', 'S R L($)', - ' CIA($)', 'EURL($)', '(^)EURL', 'SARL($)', '(^)SARL', 'SNC($)', '(^)SNC', 'SPC($)', '(^)SPC', 'SPA($)', - 'SAS($)', ' SA($)', ' S A($)', ' SL($)', ' S L($)', ' SC($)', ' S C($)', 'CO WLL($)', 'CO LIB($)', - ' AS($)', ' A S($)', 'PJSC($)', 'P JSC($)', 'OJSC($)', 'CJSC($)' 'JSC($)', ' EPP($)', ' CB($)', ' C B($)', - ' CA($)', ' C A($)', ' GIE($)', 'KABUSHIKI KAISHA($)', ' KK($)', 'K K($)', ' BV($)', ' B V($)', - 'YUGEN KAISHA', 'YUGEN', 'KAISHA', 'KAISYA', 'YUGEN KAISYA', 'GYOGYO', 'GYOGYOU', 'GAISHA', ' JU($)', - 'OOO($)', '(^)OOO', 'CO PVT($)', 'COMPANY PVT($)', ' PT($)', ' P T($)', '(^)PT', ' CC($)', - ' CO($)', 'COMPANY($)', ' NV($)', ' N V($)', '^NA($)', '^N A($)', 'RPTD SOLD.*', 'OWNER UNKNOWN*', - 'CO LT', 'EHF($)', '(^)EHF'] - text_to_remove = '|'.join(text_to_remove) + text_to_remove = [ + "CO LTD", + "COLTD", + "COMPANY LTD", + "CO LIMITED", + "COMPANY LIMITED", + "CO LIMTED", + "CO LTTD", + "CV LIMITADA", + "LTD SA($)", + "LTD S A($)", + "CO SA($)", + "CO S A($)", + "CO AB($)", + "CO A B($)", + "CO PTY LTD($)", + "CO LRD($)", + "PTY LIMITED($)", + "PTY LTD($)", + "SA PTY LTD($)", + "CORP LTD($)", + "LTDA EPP($)", + "JOINT STOCK COMPANY($)", + "JOINTSTOCK COMPANY($)", + "CORPORATION PTE LTD($)", + "CORPORATION PTE($)", + "CORP PTE($)", + "CORP SA($)", + "CORP INC($)", + "CORPORATION($)", + "CORP($)", + "INCORPORATED($)", + "INC($)", + "AP PTE LTD", + "CO PTE LTD", + "GMBH CO", + "GMBH($)", + "LTD($)", + "LTDA($)", + "LIMITED($)", + "PTE($)", + "LIMITADA($)", + "LDA($)", + "LLC($)", + "COMPANY NV($)", + "COMPANY N V($)", + "COMPANY BV($)", + "COMPANY B V($)", + "CO BV($)", + "CO B V($)", + "CO NV($)", + "CO N V($)", + "SA DE CV($)", + "S A DE C V($)", + "SCL DE CV($)", + "S C L DE C V($)", + "SCL($)", + "S C L($)", + "S C DE R L($)", + "S R L DE C V($)", + "SAC($)", + "S A C($)", + "EIRL($)", + "E I R L($)", + "SRL($)", + "S R L($)", + " CIA($)", + "EURL($)", + "(^)EURL", + "SARL($)", + "(^)SARL", + "SNC($)", + "(^)SNC", + "SPC($)", + "(^)SPC", + "SPA($)", + "SAS($)", + " SA($)", + " S A($)", + " SL($)", + " S L($)", + " SC($)", + " S C($)", + "CO WLL($)", + "CO LIB($)", + " AS($)", + " A S($)", + "PJSC($)", + "P JSC($)", + "OJSC($)", + "CJSC($)" "JSC($)", + " EPP($)", + " CB($)", + " C B($)", + " CA($)", + " C A($)", + " GIE($)", + "KABUSHIKI KAISHA($)", + " KK($)", + "K K($)", + " 
BV($)", + " B V($)", + "YUGEN KAISHA", + "YUGEN", + "KAISHA", + "KAISYA", + "YUGEN KAISYA", + "GYOGYO", + "GYOGYOU", + "GAISHA", + " JU($)", + "OOO($)", + "(^)OOO", + "CO PVT($)", + "COMPANY PVT($)", + " PT($)", + " P T($)", + "(^)PT", + " CC($)", + " CO($)", + "COMPANY($)", + " NV($)", + " N V($)", + "^NA($)", + "^N A($)", + "RPTD SOLD.*", + "OWNER UNKNOWN*", + "CO LT", + "EHF($)", + "(^)EHF", + ] + text_to_remove = "|".join(text_to_remove) if type(elem) == pd.core.series.Series: elem = elem.apply( - lambda x: unidecode(re.sub(r'\(.+\)', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"\(.+\)", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem.apply( - lambda x: unidecode(re.sub(r'[^\w]+', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"[^\w]+", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem.apply( - lambda x: re.sub(text_to_remove, ' ', x) if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(text_to_remove, " ", x) + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem.apply( - lambda x: re.sub(r'\s+', ' ', x).strip() if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(r"\s+", " ", x).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) return elem.apply( - lambda x: re.sub('FISHERY', 'FISHERIES', x) if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub("FISHERY", "FISHERIES", x) + if (x == x) & (x is not None) & (x != "") + else None + ) elif type(elem) == pd.core.frame.DataFrame: elem = elem[check_field].apply( - lambda x: unidecode(re.sub(r'\(.+\)', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"\(.+\)", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem[check_field].apply( - lambda x: unidecode(re.sub(r'[^\w]+', ' ', x)).strip() if (x == x) & (x != None) & (x != '') else None) + lambda x: unidecode(re.sub(r"[^\w]+", " ", x)).strip() + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem[check_field].apply( - lambda x: re.sub(text_to_remove, ' ', x) if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(text_to_remove, " ", x) + if (x == x) & (x is not None) & (x != "") + else None + ) elem = elem[check_field].apply( - lambda x: re.sub(r'\s+', ' ', x).strip() if (x == x) & (x != None) * (x != '') else None) + lambda x: re.sub(r"\s+", " ", x).strip() + if (x == x) & (x is not None) * (x != "") + else None + ) return elem[check_field].apply( - lambda x: re.sub('FISHERY', 'FISHERIES', x) if (x == x) & (x != None) * (x != '') else None) - elif (elem != elem) | (elem == None) | (elem == '') | (elem == 0): + lambda x: re.sub("FISHERY", "FISHERIES", x) + if (x == x) & (x is not None) * (x != "") + else None + ) + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): return np.nan elif type(elem) == str: - elem = unidecode(re.sub(r'\(.+\)', ' ', elem)).strip() - elem = unidecode(re.sub(r'[^\w]+', ' ', elem)).strip() - elem = re.sub(text_to_remove, ' ', elem) - elem = re.sub(r'\s+', ' ', elem).strip() - return re.sub('FISHERY', 'FISHERIES', elem) + elem = unidecode(re.sub(r"\(.+\)", " ", elem)).strip() + elem = unidecode(re.sub(r"[^\w]+", " ", elem)).strip() + elem = re.sub(text_to_remove, " ", elem) + elem = re.sub(r"\s+", " ", elem).strip() + return re.sub("FISHERY", "FISHERIES", elem) else: - raise 
ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize Integer in a form of string -# because Pandas Series or DataFrame considers -# a column of integers with Nulls as a column of float -# Save it as a string column so that it can be uploaded -# as integer columns when uploading to BigQuery. -# def standardize_int_str(elem, check_field=True): + """ + This module standardizes an integer in the form of string + because Pandas Series or DataFrame considers a column of integers + with Nulls as a column of float. Save it as a string column so that + it can be uploaded as integer columns when uploading to BigQuery. + + :param elem: Pandas Series, DataFrame, STRING, INT, FLOAT, a data type + that contains a string field + :param check_field: Boolean, field that contains the given strings + :return: Same as the input elem type + """ if check_field: if type(elem) == pd.core.series.Series: return elem.apply( - lambda x: str(int(float(re.sub('[^\d\.]', '', str(x))))) - if (x == x) & (x is not None) & (x != '') else None) + lambda x: str(int(float(re.sub(r"[^\d.]", "", str(x))))) + if (x == x) & (x is not None) & (x != "") + else None + ) elif type(elem) == pd.core.frame.DataFrame: return elem[check_field].apply( - lambda x: str(int(float(re.sub('[^\d\.]', '', str(x))))) - if (x == x) & (x is not None) & (x != '') else None) - elif (elem != elem) | (elem is None) | (elem == ''): + lambda x: str(int(float(re.sub(r"[^\d.]", "", str(x))))) + if (x == x) & (x is not None) & (x != "") + else None + ) + elif (elem != elem) | (elem is None) | (elem == ""): return None elif (type(elem) == str) | (type(elem) == int) | (type(elem) == float): - return str(int(float(re.sub(r'[^\d\.]', '', str(elem))))) + return str(int(float(re.sub(r"[^\d.]", "", str(elem))))) else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None -# -# Standardize timestamp -# def standardize_time(elem, check_field=True): - if check_field: - if type(elem)==pd.core.series.Series: - return elem.apply(lambda x: pd.to_datetime(x, errors='coerce') if (x==x)&(x!=None)&(x!='') else None) - elif type(elem)==pd.core.frame.DataFrame: - return elem[check_field].apply(lambda x: pd.to_datetime(x, errors='coerce') if (x==x)&(x!=None)&(x!='') else None) - elif (elem!=elem)|(elem==None)|(elem=='')|(elem==0): - return np.nan - elif (type(elem)==str)|(type(elem)==pd.Timestamp): - return pd.to_datetime(elem, errors='coerce') - else: - raise ValueError('Unknown type received') - else: - return None - - -def clean_uvi(x): - if (type(x)==float)|(type(x)==int): - if (not np.isnan(x))&(x==x)&(x!=None): - return str(int(x)) - else: - return np.nan - else: - return re.sub('\s+', ' ', x).strip().upper() + """ + This modules standardizes a timestamp + :param elem: Pandas DATAFRAME, SERIES, STRING, a data type containing + time stamp information + :param check_field: Boolean, whether the field that contains + the timestamp information + :return: Same type as the elem input + """ -def standardize_uvi(elem, check_field=True): if check_field: - if type(elem)==pd.core.series.Series: - return elem.apply(lambda x: clean_uvi(x)) - elif type(elem)==pd.core.frame.DataFrame: - return elem[check_field].apply(lambda x: clean_uvi(x)) - elif (elem!=elem)|(elem==None)|(elem=='')|(elem==0): - return None - elif (type(elem)==int)|(type(elem)==float): - return str(int(elem)) - elif type(elem)==str: - return re.sub('\s+',' ',elem).strip().upper() + if type(elem) == 
pd.core.series.Series: + return elem.apply( + lambda x: pd.to_datetime(x, errors="coerce") + if (x == x) & (x is not None) & (x != "") + else None + ) + elif type(elem) == pd.core.frame.DataFrame: + return elem[check_field].apply( + lambda x: pd.to_datetime(x, errors="coerce") + if (x == x) & (x is not None) & (x != "") + else None + ) + elif (elem != elem) | (elem is None) | (elem == "") | (elem == 0): + return np.nan + elif (type(elem) == str) | (type(elem) == pd.Timestamp): + return pd.to_datetime(elem, errors="coerce") else: - raise ValueError('Unknown type received') + raise ValueError("Unknown type received") else: return None @@ -323,10 +513,10 @@ def standardize_flag(df, field, rules): """ Flag mapping based on YAML mapping file per registry - :param df: - :param field: - :param rules: - :return: + :param df: Pandas DataFrame, a dataframe containing flag information field + :param field: STRING, the name of the field containing flag information + :param rules: DICT, the YAML mapping rule + :return: Pandas Series or STRING """ if field: if rules: @@ -334,17 +524,19 @@ def standardize_flag(df, field, rules): # In case it's explicitly "ALL" as an option, # returns the preset value if "ALL" in rules: - return rules['ALL'] + return rules["ALL"] # # If it's "SAME" option, use the values in the flag field - elif 'SAME' in rules: + elif "SAME" in rules: return df[field] # # iso3 country code - note that all is turned to upper cases else: return df[field].apply( lambda x: rules[unidecode(str(x)).strip().upper()] - if (x == x) & (x is not None) & (x != '') else None) + if (x == x) & (x is not None) & (x != "") + else None + ) else: return None else: @@ -355,16 +547,17 @@ def standardize_geartype(df, field, rules): """ Geartype mapping based on YAML mapping file per registry - :param df: - :param field: - :param rules: - :return: + :param df: Pandas DataFrame, a DataFrame containing geartype + information field + :param field: STRING, the name of the field containing geartype information + :param rules: DICT, the YAML mapping rule + :return: Pandas Series or STRING """ if field: if rules: - if 'ALL' in rules: - return rules['ALL'] - elif 'SAME' in rules: + if "ALL" in rules: + return rules["ALL"] + elif "SAME" in rules: return df[field] # # note that when mapping geartype, @@ -372,8 +565,40 @@ def standardize_geartype(df, field, rules): else: return df[field].apply( lambda x: rules[unidecode(str(x)).strip().lower()] - if (x == x) & (x is not None) & (x != '') else None) + if (x == x) & (x is not None) & (x != "") + else None + ) else: return None else: return None + + +# +# Below is not used. 
+# def clean_uvi(x): +# if (type(x) == float) | (type(x) == int): +# if (not np.isnan(x)) & (x == x) & (x is not None): +# return str(int(x)) +# else: +# return np.nan +# else: +# return re.sub("\s+", " ", x).strip().upper() +# +# +# def standardize_uvi(elem, check_field=True): +# if check_field: +# if type(elem) == pd.core.series.Series: +# return elem.apply(lambda x: clean_uvi(x)) +# elif type(elem) == pd.core.frame.DataFrame: +# return elem[check_field].apply(lambda x: clean_uvi(x)) +# elif (elem != elem) | (elem == None) | (elem == "") | (elem == 0): +# return None +# elif (type(elem) == int) | (type(elem) == float): +# return str(int(elem)) +# elif type(elem) == str: +# return re.sub("\s+", " ", elem).strip().upper() +# else: +# raise ValueError("Unknown type received") +# else: +# return None diff --git a/shipdataprocess/__init__.py b/shipdataprocess/__init__.py index de625df..444767d 100644 --- a/shipdataprocess/__init__.py +++ b/shipdataprocess/__init__.py @@ -3,7 +3,7 @@ """ -__version__ = "0.7.1" +__version__ = "0.7.0" __author__ = "Jaeyoon Park" __email__ = "jaeyoon@globalfishingwatch.org" __source__ = "https://github.com/GlobalFishingWatch/shipdataprocess" From 869fc834dbf05ac12248265b7cd9d07af1b49d7d Mon Sep 17 00:00:00 2001 From: jaeyoonpark Date: Fri, 28 Jan 2022 00:22:45 +0100 Subject: [PATCH 4/4] print bug fixed --- CHANGES.md | 1 + build/lib/shipdataprocess/__init__.py | 2 +- build/lib/shipdataprocess/normalize.py | 9 ++------- shipdataprocess/__init__.py | 2 +- shipdataprocess/normalize.py | 9 ++------- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 2129f6f..e43ea06 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -39,3 +39,4 @@ v0.6.16, 2020-11-26 -- Make smart_upper() to capture multiple URLs not to capita v0.6.17, 2021-07-30 -- Add Indonesian prefix and Chinese HAO v0.6.18, 2021-08-04 -- Fix a bug in normalize_callsign() regarding NULL/NONE v0.7.0, 2022-01-26 -- Fix it to work only in Python 3.6 or above, codes are compliant with PEP8, dependencies are clearer (Django removed) +v0.7.1, 2022-01-27 -- Bug fixed diff --git a/build/lib/shipdataprocess/__init__.py b/build/lib/shipdataprocess/__init__.py index 444767d..de625df 100644 --- a/build/lib/shipdataprocess/__init__.py +++ b/build/lib/shipdataprocess/__init__.py @@ -3,7 +3,7 @@ """ -__version__ = "0.7.0" +__version__ = "0.7.1" __author__ = "Jaeyoon Park" __email__ = "jaeyoon@globalfishingwatch.org" __source__ = "https://github.com/GlobalFishingWatch/shipdataprocess" diff --git a/build/lib/shipdataprocess/normalize.py b/build/lib/shipdataprocess/normalize.py index c95a2cd..a374ba7 100644 --- a/build/lib/shipdataprocess/normalize.py +++ b/build/lib/shipdataprocess/normalize.py @@ -22,14 +22,9 @@ def normalize_shipname(name): if (name is None) | (name != name) | (name == ""): return None - print(name) + # # Remove nasty characters and white spaces - # try: - # name = unidecode(str(name.decode("utf-8"))) - # except UnicodeDecodeError: - # name = unidecode(str(name.decode("iso_8859-1"))) - if issubclass(type(name), str): name = unidecode(name) elif isinstance(name, bytes): @@ -41,7 +36,7 @@ def normalize_shipname(name): name = str(name) else: return None - print(name) + # # Turn to upper cases name = name.upper() diff --git a/shipdataprocess/__init__.py b/shipdataprocess/__init__.py index 444767d..de625df 100644 --- a/shipdataprocess/__init__.py +++ b/shipdataprocess/__init__.py @@ -3,7 +3,7 @@ """ -__version__ = "0.7.0" +__version__ = "0.7.1" __author__ = "Jaeyoon Park" __email__ 
= "jaeyoon@globalfishingwatch.org" __source__ = "https://github.com/GlobalFishingWatch/shipdataprocess" diff --git a/shipdataprocess/normalize.py b/shipdataprocess/normalize.py index c95a2cd..a374ba7 100644 --- a/shipdataprocess/normalize.py +++ b/shipdataprocess/normalize.py @@ -22,14 +22,9 @@ def normalize_shipname(name): if (name is None) | (name != name) | (name == ""): return None - print(name) + # # Remove nasty characters and white spaces - # try: - # name = unidecode(str(name.decode("utf-8"))) - # except UnicodeDecodeError: - # name = unidecode(str(name.decode("iso_8859-1"))) - if issubclass(type(name), str): name = unidecode(name) elif isinstance(name, bytes): @@ -41,7 +36,7 @@ def normalize_shipname(name): name = str(name) else: return None - print(name) + # # Turn to upper cases name = name.upper()