Merge pull request #2 from GlobalFishingWatch/clean_repo

Merge Clean_repo branch to master
GlobalFishingWatch · Apr 28, 2023 · a8b622a · a8b622a
2 parents 3896c8c + 869fc83
commit a8b622a
Show file tree

Hide file tree

Showing 17 changed files with 2,285 additions and 1,240 deletions.
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ sdist/
 var/
 wheels/
 *.egg-info/
+/*.egg-info/*
 .installed.cfg
 *.egg
 MANIFEST

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,11 @@
+repos:
+-   repo: https://github.com/psf/black
+    rev: 21.11b0
+    hooks:
+    - id: black
+      args: [--line-length=79]
+-   repo: https://gitlab.com/pycqa/flake8
+    rev: 3.9.2
+    hooks:
+    - id: flake8
+
diff --git a/CHANGES.md b/CHANGES.md
@@ -38,5 +38,5 @@ v0.6.15, 2020-11-06 -- Fix a bug in normalize_shipname() and normalize_callsign(
 v0.6.16, 2020-11-26 -- Make smart_upper() to capture multiple URLs not to capitalize them
 v0.6.17, 2021-07-30 -- Add Indonesian prefix and Chinese HAO
 v0.6.18, 2021-08-04 -- Fix a bug in normalize_callsign() regarding NULL/NONE
-
-
+v0.7.0, 2022-01-26 -- Now works only with Python 3.6 or above; code is PEP8-compliant; dependencies are clearer (Django removed)
+v0.7.1, 2022-01-27 -- Bug fixed
diff --git a/build/lib/shipdataprocess/__init__.py b/build/lib/shipdataprocess/__init__.py
@@ -3,15 +3,15 @@
 """
 
 
-__version__ = '0.6.18'
-__author__ = 'Jaeyoon Park'
-__email__ = 'jaeyoon@globalfishingwatch.org'
-__source__ = 'https://github.com/GlobalFishingWatch/shipdataprocess'
+__version__ = "0.7.1"
+__author__ = "Jaeyoon Park"
+__email__ = "jaeyoon@globalfishingwatch.org"
+__source__ = "https://github.com/GlobalFishingWatch/shipdataprocess"
 __license__ = """
 Copyright 2017 Global Fishing Watch Inc.
 Authors:
 
-Jaeyoon Park <jaeyoon@globalfishingwatch.org>
+Jaeyoon Park <jaeyoon@globalfishingwatch.org>
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

diff --git a/build/lib/shipdataprocess/collapse.py b/build/lib/shipdataprocess/collapse.py
@@ -1,32 +1,51 @@
+"""
+This file contains functions that help collapse (multiple) rows
+for each vessel found in the process of producing Global Fishing Watch's
+internal vessel database.
+
+Last updated: 2022-01-24
+Jaeyoon Park
+"""
+
 import pandas as pd
 import numpy as np
 import re
 from collections import Counter
 
 
-### helper functions for collapsing rows by vessel
-
 def non_zero_mean(x):
     try:
-        x = x[(x!=0)&(x!=None)]
-        if len(x)==0: return 0.0
-        else: return x.mean()
-    except:
+        x = x[(x != 0) & (x is not None)]
+        if len(x) == 0:
+            return 0.0
+        else:
+            return x.mean()
+    except AttributeError:
         return 0.0
-
+
+
 def non_zero_std(x):
     try:
-        x = x[(x!=0)&(x!=None)]
-        if len(x)<2: return 0.0
-        else: return x.std()
-    except: 
+        x = x[(x != 0) & (x is not None)]
+        if len(x) < 2:
+            return 0.0
+        else:
+            return x.std()
+    except AttributeError:
         return 0.0
-
-def most_common_value(x): ## remove if standard deviation is too big compared to mean value of all numbers
-    '''remove if standard deviation is too big compared to mean value of all numbers'''
-    if (type(x)==list)&(len(x)>0):
+
+
+def most_common_value(x):
+    """
+    Remove if standard deviation is too big compared to mean value of
+    all numbers. The standard deviation threshold is set to be 10%.
+
+    x: Pandas Series or list, a list of numerical values
+    (for length, tonnage, engine power)
+    """
+    if (type(x) == list) & (len(x) > 0):
         x = pd.Series(x)
-    if (type(x)==pd.core.series.Series)&(len(x.dropna())>0):
+    if (type(x) == pd.core.series.Series) & (len(x.dropna()) > 0):
         x_mean = non_zero_mean(x)
         x_std = non_zero_std(x)
         if x_std > x_mean * 0.1:
@@ -36,95 +55,156 @@ def most_common_value(x): ## remove if standard deviation is too big compared to
     else:
         return np.nan
 
+
 def most_common_value_with_confidence(cx):
-    '''same functionality as most_common_value() but with confidence level taken account'''
-    if (type(cx)==pd.core.series.Series)&(len(cx)>0):
-        if len(cx.dropna())==0:
+    """
+    same functionality as most_common_value() but with confidence level
+    taken into account
+
+    cx: Pandas Series or list, a list of numerical values
+    (for length, tonnage, engine power)
+    with a confidence level indicator attached with '-' in front of the value.
+    """
+    if (type(cx) == pd.core.series.Series) & (len(cx) > 0):
+        if len(cx.dropna()) == 0:
             return np.nan
         else:
             cx = list(cx.values)
-    if (type(cx)==list)&(len(cx)>0):
-        clist = [int(elem.split('-')[0]) for elem in cx if (elem==elem)&(elem!=None)]
-        xlist = [elem for elem in cx if (elem==elem)&(elem!=None)]
-        if len(clist)>0:
+    if (type(cx) == list) & (len(cx) > 0):
+        clist = [
+            int(elem.split("-")[0])
+            for elem in cx
+            if (elem == elem) & (elem is not None)
+        ]
+        xlist = [elem for elem in cx if (elem == elem) & (elem is not None)]
+        if len(clist) > 0:
             max_c = max(clist)
-            x = [float(elem.split('-')[1]) for elem in xlist if int(elem.split('-')[0])==max_c]
+            x = [
+                float(elem.split("-")[1])
+                for elem in xlist
+                if int(elem.split("-")[0]) == max_c
+            ]
+            # Call the function to return the most common value
             return most_common_value(x)
         else:
             return np.nan
     else:
         return np.nan
-
-def most_common_num(x): ## mostly for imo collapsing
+
+
+def most_common_num(x):
+    """
+    Return the most common number (mostly for imo collapsing).
+
+    x: Pandas Series, a list of numbers
+    """
     try:
         x = x.dropna()
-        if len(x)==0:
+        if len(x) == 0:
             return np.nan
         else:
             vals = x.values
-            vs = [v for v in vals if (v!=0)]
-        #vs = list(set(vs))
-            if len(vs)==0:
+            vs = [v for v in vals if (v != 0)]
+            # vs = list(set(vs))
+            if len(vs) == 0:
                 return np.nan
-            else: 
+            else:
                 data = Counter(vs)
                 return max(vs, key=data.get)
-    except:
+    except AttributeError:
         return np.nan
-
+
+
 def most_common_str(x):
+    """
+    Return the most common string.
+
+    x: Pandas Series, a list of values in string
+    """
     try:
         x = x.dropna()
-        if len(x)==0:
+        if len(x) == 0:
             return np.nan
         else:
-            vals = x.values
-            vs = [re.sub('\s+',' ',str(v)).strip().upper() for v in x.values]
-            vs = [v for v in vs if v!='']
-        #vs = list(set(vs))
-            if len(vs)==0:
+            vs = [
+                re.sub(r"\s+", " ", str(v)).strip().upper() for v in x.values
+            ]
+            vs = [v for v in vs if v != ""]
+            # vs = list(set(vs))
+            if len(vs) == 0:
                 return np.nan
             else:
                 data = Counter(vs)
                 return max(vs, key=data.get)
 
-        #if len(vs)==1:
-        #    return vs[0]
-        #else:
-        #    return None
-    except:
+    except AttributeError:
         return np.nan
-
-def str_attached(x): ## join all strings
+
+
+def str_attached(x):
+    """
+    Return all strings joined. If the values are in numbers, convert them
+    to string and combined.
+
+    :param x: Pandas Series or list
+    :return: A joined string
+    """
     try:
         x = x.dropna()
-        if len(x)==0:
+        if len(x) == 0:
             return np.nan
         else:
-            x = x.apply(lambda v: str(int(v)) if (type(v)==float)|(type(v)==int)|(type(v)==long) else v)  
+            x = x.apply(
+                lambda v: str(int(v))
+                if (type(v) == float) | (type(v) == int)
+                else v
+            )
             vals = x.values.tolist()
-        #vs = [str(v).strip() for v in vals if (v==v)&(v!=None)&(v!='')]
-        #vs = [v for v in vs if (v!='')]
+            # vs = [str(v).strip() for v in vals if (v==v)&(v!=None)&(v!='')]
+            # vs = [v for v in vs if (v!='')]
             vs = list(set(vals))
-            return ', '.join(sorted(vs))
-    except:
+            return ", ".join(sorted(vs))
+    except AttributeError:
         return np.nan
-
+
+
 def min_time(x):
+    """
+    Return the minimum time
+
+    :param x: Pandas Series
+    :return: Timestamp
+    """
     vals = x.values
-    vs = [v for v in vals if (v==v)&(v!=None)&(v!='')]
+    vs = [v for v in vals if (v == v) & (v is not None) & (v != "")]
     vs = pd.Series(vs)
+
     return vs.min()
 
+
 def max_time(x):
+    """
+    Return the maximum time
+
+    :param x: Pandas Series
+    :return: Timestamp
+    """
     vals = x.values
-    vs = [v for v in vals if (v==v)&(v!=None)&(v!='')]
+    vs = [v for v in vals if (v == v) & (v is not None) & (v != "")]
     vs = pd.Series(vs)
+
     return vs.max()
 
+
 def highest_confidence(x):
+    """
+    Return the maximum confidence if none return 1 (the lowest).
+
+    :param x: Pandas Series or list
+    :return: Integer
+    """
     x = x.dropna()
-    if len(x)>0:
+    if len(x) > 0:
         return max(x.tolist())
     else:
         return 1