Skip to content

Commit

Permalink
Merge pull request #2 from GlobalFishingWatch/clean_repo
Browse files Browse the repository at this point in the history
Merge Clean_repo branch to master
  • Loading branch information
jaeyoonpark authored Apr 28, 2023
2 parents 3896c8c + 869fc83 commit a8b622a
Show file tree
Hide file tree
Showing 17 changed files with 2,285 additions and 1,240 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ sdist/
var/
wheels/
*.egg-info/
/*.egg-info/*
.installed.cfg
*.egg
MANIFEST
Expand Down
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
repos:
  - repo: https://github.com/psf/black
    rev: 21.11b0
    hooks:
      - id: black
        args: [--line-length=79]
  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8

4 changes: 2 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,5 @@ v0.6.15, 2020-11-06 -- Fix a bug in normalize_shipname() and normalize_callsign(
v0.6.16, 2020-11-26 -- Make smart_upper() detect multiple URLs so that they are not capitalized
v0.6.17, 2021-07-30 -- Add Indonesian prefix and Chinese HAO
v0.6.18, 2021-08-04 -- Fix a bug in normalize_callsign() regarding NULL/NONE


v0.7.0, 2022-01-26 -- Require Python 3.6 or above; code is now PEP8-compliant and dependencies are clearer (Django removed)
v0.7.1, 2022-01-27 -- Bug fixed
10 changes: 5 additions & 5 deletions build/lib/shipdataprocess/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
"""


__version__ = '0.6.18'
__author__ = 'Jaeyoon Park'
__email__ = 'jaeyoon[email protected]'
__source__ = 'https://github.com/GlobalFishingWatch/shipdataprocess'
__version__ = "0.7.1"
__author__ = "Jaeyoon Park"
__email__ = "jaeyoon@globalfishingwatch.org"
__source__ = "https://github.com/GlobalFishingWatch/shipdataprocess"
__license__ = """
Copyright 2017 Global Fishing Watch Inc.
Authors:
Jaeyoon Park <jaeyoon[email protected]>
Jaeyoon Park <jaeyoon@globalfishingwatch.org>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
192 changes: 136 additions & 56 deletions build/lib/shipdataprocess/collapse.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,51 @@
"""
This file contains functions that help collapse (multiple) rows
for each vessel found in the process of producing Global Fishing Watch's
internal vessel database.
Last updated: 2022-01-24
Jaeyoon Park
"""

import pandas as pd
import numpy as np
import re
from collections import Counter


### helper functions for collapsing rows by vessel

def non_zero_mean(x):
    """
    Return the mean of the non-zero, non-null values of x.

    :param x: Pandas Series (or NumPy array) of numerical values
    :return: Mean of the retained values, or 0.0 when nothing is retained
        or when x does not support boolean-mask indexing (e.g. list, None)
    """
    try:
        # Nulls must be masked element-wise: `x is not None` is a single
        # identity test on the container and never filters anything.
        kept = x[(x != 0) & pd.notna(x)]
        if len(kept) == 0:
            return 0.0
        return kept.mean()
    except (AttributeError, TypeError):
        # Plain lists / None fail on mask indexing with TypeError; keep the
        # historical best-effort fallback instead of propagating.
        return 0.0



def non_zero_std(x):
    """
    Return the standard deviation of the non-zero, non-null values of x.

    :param x: Pandas Series (or NumPy array) of numerical values
    :return: Standard deviation of the retained values, or 0.0 when fewer
        than two values are retained or when x does not support
        boolean-mask indexing (e.g. list, None)
    """
    try:
        # Nulls must be masked element-wise: `x is not None` is a single
        # identity test on the container and never filters anything.
        kept = x[(x != 0) & pd.notna(x)]
        # A spread needs at least two observations.
        if len(kept) < 2:
            return 0.0
        return kept.std()
    except (AttributeError, TypeError):
        # Plain lists / None fail on mask indexing with TypeError; keep the
        # historical best-effort fallback instead of propagating.
        return 0.0

def most_common_value(x): ## remove if standard deviation is too big compared to mean value of all numbers
'''remove if standard deviation is too big compared to mean value of all numbers'''
if (type(x)==list)&(len(x)>0):


def most_common_value(x):
"""
Remove if standard deviation is too big compared to mean value of
all numbers. The standard deviation threshold is set to be 10%.
x: Pandas Series or list, a list of numerical values
(for length, tonnage, engine power)
"""
if (type(x) == list) & (len(x) > 0):
x = pd.Series(x)
if (type(x)==pd.core.series.Series)&(len(x.dropna())>0):
if (type(x) == pd.core.series.Series) & (len(x.dropna()) > 0):
x_mean = non_zero_mean(x)
x_std = non_zero_std(x)
if x_std > x_mean * 0.1:
Expand All @@ -36,95 +55,156 @@ def most_common_value(x): ## remove if standard deviation is too big compared to
else:
return np.nan


def most_common_value_with_confidence(cx):
    """
    Same functionality as most_common_value() but with the confidence
    level taken into account: only the values carrying the highest
    confidence level present are passed on to most_common_value().

    :param cx: Pandas Series or list of strings formatted as
        "<confidence>-<value>" (e.g. "3-12.5"); null entries are ignored
    :return: Result of most_common_value() on the highest-confidence
        values, or NaN when there is no usable entry or cx is neither a
        Series nor a list
    """
    if isinstance(cx, pd.Series):
        if len(cx.dropna()) == 0:
            return np.nan
        cx = list(cx.values)
    if not isinstance(cx, list) or len(cx) == 0:
        return np.nan

    # Parse every entry once. Only the first "-" separates the confidence
    # from the value, so values that themselves contain "-" (negative
    # numbers, "1e-05") are kept intact.
    parsed = []
    for elem in cx:
        if elem is None or elem != elem:  # skip None and NaN
            continue
        confidence, _, value = elem.partition("-")
        parsed.append((int(confidence), value))
    if not parsed:
        return np.nan

    max_c = max(confidence for confidence, _ in parsed)
    x = [float(value) for confidence, value in parsed if confidence == max_c]
    # Call the function to return the most common value
    return most_common_value(x)

def most_common_num(x):
    """
    Return the most common non-zero number (mostly for imo collapsing).

    :param x: Pandas Series of numbers
    :return: The most frequent non-zero, non-null value (on ties, the one
        that appears first), or NaN when there is none or when x has no
        dropna() method (e.g. list, None)
    """
    try:
        x = x.dropna()
        if len(x) == 0:
            return np.nan
        # Zeros are placeholders for "unknown" and must not win the vote.
        vs = [v for v in x.values if v != 0]
        if len(vs) == 0:
            return np.nan
        counts = Counter(vs)
        # Counter keeps first-occurrence order, so ties resolve to the
        # value seen first in x.
        return max(counts, key=counts.get)
    except AttributeError:
        return np.nan



def most_common_str(x):
    """
    Return the most common string after normalization.

    Each value is converted to str, runs of whitespace are collapsed to a
    single space, and the result is stripped and upper-cased before
    counting; values that normalize to an empty string are ignored.

    :param x: Pandas Series of string values
    :return: The most frequent normalized string (on ties, the one that
        appears first), or NaN when there is none or when x has no
        dropna() method (e.g. list, None)
    """
    try:
        x = x.dropna()
        if len(x) == 0:
            return np.nan
        normalized = (
            re.sub(r"\s+", " ", str(v)).strip().upper() for v in x.values
        )
        vs = [v for v in normalized if v != ""]
        if len(vs) == 0:
            return np.nan
        counts = Counter(vs)
        # Counter keeps first-occurrence order, so ties resolve to the
        # string seen first in x.
        return max(counts, key=counts.get)
    except AttributeError:
        return np.nan

def str_attached(x):
    """
    Return the distinct values of x joined into one string.

    Numbers (Python or NumPy scalars) are truncated to integers and
    rendered without a decimal part; every other value is converted with
    str() so that sorting and joining cannot fail on mixed types.

    :param x: Pandas Series
    :return: The distinct values, sorted and joined with ", ", or NaN when
        x has no non-null value or has no dropna() method (e.g. list)
    """

    def _as_text(v):
        # bool is an int subclass but is not an identifier-like number.
        is_number = isinstance(
            v, (int, float, np.integer, np.floating)
        ) and not isinstance(v, bool)
        return str(int(v)) if is_number else str(v)

    try:
        x = x.dropna()
        if len(x) == 0:
            return np.nan
        vals = x.apply(_as_text).values.tolist()
        return ", ".join(sorted(set(vals)))
    except AttributeError:
        return np.nan



def min_time(x):
    """
    Return the minimum time, ignoring null and empty-string entries.

    :param x: Pandas Series
    :return: The smallest remaining value (whatever Series.min() yields
        for an empty Series when nothing remains)
    """
    # `v == v` is False only for NaN/NaT, which drops them from the list.
    vs = [v for v in x.values if v == v and v is not None and v != ""]
    return pd.Series(vs).min()


def max_time(x):
    """
    Return the maximum time, ignoring null and empty-string entries.

    :param x: Pandas Series
    :return: The largest remaining value (whatever Series.max() yields
        for an empty Series when nothing remains)
    """
    # `v == v` is False only for NaN/NaT, which drops them from the list.
    vs = [v for v in x.values if v == v and v is not None and v != ""]
    return pd.Series(vs).max()


def highest_confidence(x):
    """
    Return the maximum confidence; if there is none, return 1 (the lowest).

    :param x: Pandas Series or list of confidence levels
    :return: The largest non-null value, or 1 when there is none
    """
    # Lists have no dropna(); wrap them so both documented input types work.
    # dtype=object keeps the original element types.
    if isinstance(x, list):
        x = pd.Series(x, dtype=object)
    x = x.dropna()
    if len(x) > 0:
        return max(x.tolist())
    return 1
Loading

0 comments on commit a8b622a

Please sign in to comment.