diff --git a/drifter_processing.py b/drifter_processing.py
new file mode 100644
index 0000000..b16b265
--- /dev/null
+++ b/drifter_processing.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 7 16:00:10 2019
+
+@author: strausz
+"""
+
+import pandas as pd
+import cartopy.crs as ccrs
+import matplotlib.pyplot as plt
+#import cartopy.feature as cfeature
+import cmocean
+#for calculating distance
+from haversine import haversine
+import argparse
+from datetime import datetime
+import numpy as np
+import re
+from erddapy import ERDDAP
+import mysql.connector
+
+parser = argparse.ArgumentParser(description='Plot drifter track on map')
+parser.add_argument('-if', '--infile', nargs=1, type=str,
+                    help='full path to input file')
+parser.add_argument('-p', '--plot', type=str,
+                    help="make plot of 'sst', 'strain', or 'speed'")
+parser.add_argument('-f', '--file', action="store_true",
+                    help="output csv file of data")
+parser.add_argument('-i', '--ice', action="store_true",
+                    help="add ice concentration as last field and output file")
+parser.add_argument('-ph', '--phyllis', action="store_true",
+                    help="output format for phyllis-friendly processing")
+parser.add_argument('-d', '--date', action="store_true",
+                    help="add occasional date to track")
+parser.add_argument('-e', '--erddap', nargs='+',
+                    help="get directly from akutan erddap server; requires argos id followed by desired years")
+parser.add_argument('-H', '--hour', action="store_true",
+                    help="resample all data to even hour and interpolate")
+parser.add_argument('-s', '--speed', action="store_true",
+                    help="add speed column")
+parser.add_argument('-V', '--vecdis', action="store_true",
+                    help="import a vecdis file as input")
+parser.add_argument('-l', '--legacy', nargs='?',
+                    help="file has legacy format from ecofoci website; if file contains ice concentration, add 'i'")
+parser.add_argument('-c', '--cut', nargs='*',
+                    type=lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S"),
+                    help="date span, e.g. '2019-01-01T00:00:00 2019-01-01T00:00:01'")
+parser.add_argument('-de', '--despike', action="store_true",
+                    help="do some simple despiking of sst")
+args = parser.parse_args()
+
+#the following is info needed for adding the ice concentration
+latfile = '/home/makushin/strausz/ecofoci_github/EcoFOCI_ssmi_ice/psn25lats_v3.dat'
+lonfile = '/home/makushin/strausz/ecofoci_github/EcoFOCI_ssmi_ice/psn25lons_v3.dat'
+#locations of ice files
+bootstrap = '/home/akutan/strausz/ssmi_ice/data/bootstrap/'
+nrt = '/home/akutan/strausz/ssmi_ice/data/nrt/'
+#latest available bootstrap year; will need to be changed as new data comes in
+boot_year = 2018
+
+if args.infile:
+    filename = args.infile[0]
+else:
+    filename = ''
+
+def decode_datafile(filename):
+    #determine if it's nrt or bootstrap from the filename prefix
+    #note that we remove the path first if it exists
+    prefix = filename.split('/')[-1][:2]
+    icefile = open(filename, 'rb')
+    if prefix == 'nt':
+        #skip the 300-byte header
+        icefile.seek(300)
+        ice = np.fromfile(icefile, dtype=np.uint8)
+        ice[ice >= 253] = 0  #land/coast/mask flag values
+        ice = ice/2.5
+    elif prefix == 'bt':
+        ice = np.fromfile(icefile, dtype=np.uint16)
+        ice = ice/10.
+        ice[ice == 110] = 0    #110 is land
+        ice[ice == 120] = 100  #120 is polar hole
+    else:
+        ice = np.nan
+    icefile.close()
+    return ice
+
+def decode_latlon(filename):
+    #the psn25 lat/lon grid files are 4-byte little-endian integers holding
+    #degrees scaled by 100000; the body of this function was unreadable in
+    #the source and is reconstructed from that standard format
+    latlon_file = open(filename, 'rb')
+    output = np.fromfile(latlon_file, dtype='<i4')
+    output = output/100000.
+    latlon_file.close()
+    return output
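+
+#The helper definitions that originally followed were garbled in the source.
+#The versions below are minimal sketches inferred from their call sites
+#further down; the unit conversion, window handling, and interpolation
+#choices are assumptions.
+
+def lon_360(longitude):
+    #express longitude on a 0-360 scale so tracks crossing the dateline
+    #stay contiguous
+    return longitude % 360
+
+def get_ice(row, df_ice):
+    #nearest-gridcell lookup: haversine distance from the drifter fix to
+    #every cell in the chopped ssmi grid, then take the closest cell's
+    #concentration
+    dist = df_ice.apply(lambda x: haversine((row.latitude, row.longitude),
+                                            (x.latitude, x.longitude)), axis=1)
+    return df_ice.loc[dist.idxmin()].ice_conc
+
+def trim_data(df, cut):
+    #keep only fixes inside the requested date span; with no dates given the
+    #record is returned untouched
+    if cut and len(cut) == 2:
+        return df[(df.index >= cut[0]) & (df.index <= cut[1])]
+    return df
+
+def hour(df):
+    #resample onto the even hour and linearly interpolate the gaps;
+    #non-numeric columns are dropped by the mean and re-added afterwards
+    df_hour = df.resample('1H').mean(numeric_only=True)
+    df_hour = df_hour.interpolate()
+    df_hour['trajectory_id'] = df.trajectory_id.iloc[0]
+    return df_hour
+
+def speed(df_hour):
+    #speed from the great-circle distance between consecutive hourly fixes;
+    #km per hour is converted to cm/s (1 km/hr ~ 27.78 cm/s), which matches
+    #the 0-120 colorbar range in the commented-out plot call below
+    fixes = list(zip(df_hour.latitude, df_hour.longitude))
+    dist = [np.nan] + [haversine(fixes[i - 1], fixes[i])
+                       for i in range(1, len(fixes))]
+    df_hour['speed'] = np.array(dist) * 100000. / 3600.
+    return df_hour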
+
+if args.legacy:
+    #the head of this branch was unreadable in the source; the read below is
+    #a hedged guess (named columns including a 'datetime' column are
+    #assumed) -- only the last four lines of the branch survived verbatim
+    df = pd.read_csv(filename, parse_dates=['datetime'])
+    df['longitude'] = df.longitude.apply(lambda x: x*-1+360 if x >= 180 else x * -1)
+    df.set_index(['datetime'], inplace=True)
+    trajectory_id = re.search(r'(\d{5,})', filename).group(0)
+    df['trajectory_id'] = trajectory_id
+
+elif args.vecdis:
+    names = ['year', 'day', 'time', 'latitude', 'longitude', 'speed',
+             'direction', 'U', 'V']
+    dtypes = {'year': str, 'day': str, 'time': str}
+    df = pd.read_csv(filename, sep=r'\s+', skiprows=2, header=0, names=names,
+                     dtype=dtypes)
+    df['time'] = df.time.str.zfill(4)
+    df['datetime'] = pd.to_datetime((df.year + df.day + df.time), format="%Y%j%H%M")
+    #to make W longitude negative and E positive
+    df['longitude'] = df.longitude.apply(lambda x: x*-1+360 if x >= 180 else x * -1)
+    df.set_index(['datetime'], inplace=True)
+    trajectory_id = re.search(r'(\d{5,})', filename).group(0)
+    df['trajectory_id'] = trajectory_id
+    #calculate overall speed from the velocity components
+    df['speed2'] = (df.U**2 + df.V**2)**(1/2)
+
+else:
+    df = pd.read_csv(filename)
+    df['datetime'] = pd.to_datetime(df['year_doy_hhmm'], format='%Y-%m-%d %H:%M:%S')
+    df['datetime'] = df.datetime.dt.tz_localize(None)  #remove timezone info
+    df.set_index(['datetime'], inplace=True)
+    df.drop(columns=['year_doy_hhmm', 'year_doy_hhmm.1'], inplace=True)
+    df['longitude'] = df.longitude * -1
+
+if args.cut or args.cut == []:
+    df = trim_data(df, args.cut)
+
+if args.hour:  #resample data onto an even hour
+    df_hour = hour(df)
+
+if args.speed:  #calculate distance for the drifter speed calculation
+    #note: -s assumes -H was also given, since speed() works on hourly fixes
+    df_speed = speed(df_hour)
+
+#now we can do the plotting and output stuff if selected
+if args.ice:
+    #note: -i likewise assumes -H and -s (it reads df_hour and its speed column)
+    df_hour['lon_360'] = df_hour.apply(lambda x: lon_360(x.longitude), axis=1)
+    df_hour['datetime'] = df_hour.index
+    df_hour.dropna(inplace=True)
+    df_hour['trajectory_id'] = df_hour.trajectory_id.astype(int)
+    df_hour['latitude'] = df_hour.latitude.round(decimals=3)
+    df_hour['longitude'] = df_hour.longitude.round(decimals=3)
+    df_hour['voltage'] = df_hour.voltage.round(decimals=2)
+    df_hour['sst'] = df_hour.sst.round(decimals=2)
+    df_hour['strain'] = df_hour.strain.round(decimals=2)
+    #group by day of year and look up one ice file per day
+    ice_conc = []
+    for name, group in df_hour.groupby(df_hour.index.dayofyear):
+        date = group.iloc[0].datetime.strftime("%Y%m%d")
+        if group.iloc[0].datetime.year <= boot_year:
+            ice_file = bootstrap + str(group.iloc[0].datetime.year) + "/" + "bt_" + date + "_f17_v3.1_n.bin"
+        else:
+            ice_file = nrt + "nt_" + date + "_f18_nrt_n.bin"
+        print("opening file: " + ice_file)
+        #chop the ice grid down to a small box around the day's fixes
+        wlon = group.lon_360.min() - .25
+        elon = group.lon_360.max() + .25
+        nlat = group.latitude.max() + .25
+        slat = group.latitude.min() - .25
+        data_ice = {'latitude': decode_latlon(latfile),
+                    'longitude': decode_latlon(lonfile),
+                    'ice_conc': decode_datafile(ice_file)}
+        df_hour_ice = pd.DataFrame(data_ice)
+        df_hour_ice.dropna(inplace=True)
+        df_hour_ice['lon_360'] = df_hour_ice.apply(lambda x: lon_360(x.longitude), axis=1)
+        df_hour_ice_chopped = df_hour_ice[(df_hour_ice.latitude < nlat) &
+                                          (df_hour_ice.latitude > slat) &
+                                          (df_hour_ice.lon_360 > wlon) &
+                                          (df_hour_ice.lon_360 < elon)]
+        ice_conc = ice_conc + group.apply(lambda x: get_ice(x, df_hour_ice_chopped), axis=1).to_list()
+    df_hour['ice_concentration'] = ice_conc
+    df_hour = df_hour.drop(['lon_360', 'datetime'], axis=1)
+    df_out = df_hour[['trajectory_id', 'latitude', 'longitude', 'sst', 'strain',
+                      'voltage', 'speed', 'ice_concentration']]
+    df_out = df_out.round({'latitude': 3, 'longitude': 3, 'sst': 2, 'strain': 1,
+                           'voltage': 1, 'speed': 1, 'ice_concentration': 1})
+    outfile = str(df_hour.trajectory_id.iloc[0]) + "_with_ice.csv"
+    df_out.to_csv(outfile)
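+
+#The plot helper was also lost in the source; this is a minimal sketch
+#reconstructed from its call site just below -- the projection, colormap
+#mapping, and output filename pattern are assumptions.
+def plot_variable(df, variable, filename):
+    cmaps = {'sst': cmocean.cm.thermal, 'strain': cmocean.cm.amp,
+             'speed': cmocean.cm.speed}
+    fig = plt.figure(figsize=(8, 8))
+    ax = plt.axes(projection=ccrs.Mercator())
+    ax.coastlines(resolution='50m')
+    sc = ax.scatter(df['longitude'], df['latitude'], s=10, c=df[variable],
+                    transform=ccrs.PlateCarree(), cmap=cmaps[variable])
+    fig.colorbar(sc, ax=ax, label=variable)
+    if filename:
+        plot_file = filename.split('/')[-1].split('.')[0] + '_' + variable + '.png'
+    else:
+        plot_file = str(df.trajectory_id.iloc[0]) + '_' + variable + '.png'
+    return fig, ax, plot_file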
"+ice_concentration) + #df_hour_ice_chopped['dist'] = df_hour_ice_chopped.apply(lambda x: haversine((data.latitude, data.longitude), (x.latitude, x.longitude)), axis=1) + df_hour['ice_concentration'] = ice_conc + df_hour=df_hour.drop(['lon_360', 'datetime'], axis=1) + df_out = df_hour[['trajectory_id','latitude','longitude','sst','strain','voltage','speed','ice_concentration']] + df_out = df_out.round({'latitude':3, 'longitude':3,'sst':2,'strain':1,'voltage':1,'speed':1, 'ice_concentration':1}) + outfile = str(df_hour.trajectory_id[0]) + "_with_ice.csv" + df_out.to_csv(outfile) + +if args.plot: + fig, ax, plot_file = plot_variable(df, args.plot, filename) +# ax.scatter(df_hour['longitude'], df_hour['latitude'], s=10, c=df_hour.speed, transform=ccrs.PlateCarree(), +# cmap=cmocean.cm.speed, vmin=0, vmax=120 ) + fig.savefig(plot_file) + +if args.file: + if args.hour: + df_out = df_hour + #df_out = df[['trajectory_id','latitude','longitude','sst','strain','voltage','speed']] + #df_out = df_out.round({'latitude':3, 'longitude':3,'sst':2,'strain':1,'voltage':1,'speed':1}) + else: + df_out = df + + if args.cut: + outfile = str(df_out.trajectory_id[0]) + '_trimmed.csv' + else: + outfile = str(df_out.trajectory_id[0]) + '_reformatted.csv' + df_out.to_csv(outfile) + +if args.despike: + #create empty df + + df['sst_pass1'] = df.sst + df['sst_pass2'] = df.sst + #group by day first + #grouped = df.groupby(df.index.date) + #group by given number of rows + #first drop obvious spikes + df_orig = df + numrows = 50 + pass1_array = np.arange(len(df)) // 50 + pass2_array = pass1_array[25:] + last_group = len(df) // 50 + end_array = np.arange(25)*0+last_group + pass2_array = np.append(pass2_array, end_array) + pd.set_option('display.max.row', None) + def remove_spikes(df, array, var): + #first remove obvious spikes + df.loc[(df[var] > 18) | (df[var] < -2.6), var] = np.nan + df_ds = pd.DataFrame() + grouped = df.groupby(array) + + argos_id = str(df.trajectory_id[0]) + + f = open(argos_id + "_despiked_" + var + ".log", 'w') + for name, group in grouped: + f.write("Standard deviation for SST: ") + f.write(str(round(group[var].std(),3))+"\n") + f.write("Mean of SST: ") + f.write(str(round(group[var].mean(),3))+"\n") + + #try using loc to set to nan + #ie df.loc[df.sst>10, 'sst']=np.nan + upper = group[var].mean() + group[var].std()*2 + lower = group[var].mean() - group[var].std()*2 + f.write("Removed any values > " + str(round(upper, 3)) + " or < " + + str(round(lower, 3)) + "\n") + group.loc[(group[var] > upper) | (group[var] < lower), var] = np.nan + #print(group[['sst', 'sst_orig']]) + f.write("Number of spikes removed: ") + f.write(str(group[var].isna().sum())+"\n") + f.write(group[['sst', var]].to_string()) + f.write("\n----------------------------------------------------\n") + #despiked = group[(group.sst < group.sst.mean() + group.sst.std()*3) & (group.sst > group.sst.mean() - group.sst.std()*3) ] + df_ds = pd.concat([df_ds, group]) + #df_ds = pd.concat(group[(group.sst < group.sst.mean() + group.sst.std()*2) & (group.sst > group.sst.mean() - group.sst.std()*2) ]) + pd.set_option('display.max.row', 10) + f.write("Total Number of spikes removed: ") + f.write(str(df_ds.sst.isna().sum())) + f.close() + return df_ds + df_ds = remove_spikes(df, pass1_array, 'sst_pass1') + df_ds = remove_spikes(df_ds, pass2_array, 'sst_pass2') + df_ds['sst_combined'] = df_ds['sst_pass1'].combine_first(df_ds["sst_pass2"]) + #reorder columns so the print nicely + df_ds = df_ds[['trajectory_id', 'latitude', 'longitude', 'strain', 
+if args.despike:
+    df['sst_pass1'] = df.sst
+    df['sst_pass2'] = df.sst
+    #group by a fixed number of rows rather than by day
+    numrows = 50
+    pass1_array = np.arange(len(df)) // numrows
+    #second pass uses the same windows shifted by half a window; pad the
+    #tail so the label array stays the same length as the frame
+    last_group = len(df) // numrows
+    pass2_array = np.append(pass1_array[numrows//2:],
+                            np.full(numrows//2, last_group))
+    pd.set_option('display.max_rows', None)
+
+    def remove_spikes(df, array, var):
+        #first remove obvious, physically implausible spikes
+        df.loc[(df[var] > 18) | (df[var] < -2.6), var] = np.nan
+        df_ds = pd.DataFrame()
+        grouped = df.groupby(array)
+        argos_id = str(df.trajectory_id.iloc[0])
+        f = open(argos_id + "_despiked_" + var + ".log", 'w')
+        for name, group in grouped:
+            f.write("Standard deviation for SST: ")
+            f.write(str(round(group[var].std(), 3)) + "\n")
+            f.write("Mean of SST: ")
+            f.write(str(round(group[var].mean(), 3)) + "\n")
+            #set anything beyond two standard deviations of the window mean to nan
+            upper = group[var].mean() + group[var].std()*2
+            lower = group[var].mean() - group[var].std()*2
+            f.write("Removed any values > " + str(round(upper, 3)) + " or < " +
+                    str(round(lower, 3)) + "\n")
+            group.loc[(group[var] > upper) | (group[var] < lower), var] = np.nan
+            f.write("Number of spikes removed: ")
+            f.write(str(group[var].isna().sum()) + "\n")
+            f.write(group[['sst', var]].to_string())
+            f.write("\n----------------------------------------------------\n")
+            df_ds = pd.concat([df_ds, group])
+        pd.set_option('display.max_rows', 10)
+        f.write("Total number of spikes removed: ")
+        f.write(str(df_ds[var].isna().sum()))
+        f.close()
+        return df_ds
+
+    df_ds = remove_spikes(df, pass1_array, 'sst_pass1')
+    df_ds = remove_spikes(df_ds, pass2_array, 'sst_pass2')
+    df_ds['sst_combined'] = df_ds['sst_pass1'].combine_first(df_ds['sst_pass2'])
+    #reorder columns so they print nicely
+    df_ds = df_ds[['trajectory_id', 'latitude', 'longitude', 'strain',
+                   'voltage', 'sst', 'sst_pass1', 'sst_pass2', 'sst_combined']]
+    argos_id = str(df.trajectory_id.iloc[0])
+    f = open(argos_id + "_despiked_full.log", 'w')
+    f.write(df_ds.to_string())
+    f.close()
+    pd.set_option('display.max_rows', 10)
+
+if args.phyllis:
+    #note: -ph assumes -H was also given, since it reads df_hour
+    df_hour['doy'] = df_hour.index.strftime('%j')
+    df_hour['hour'] = df_hour.index.strftime('%H')
+    df_hour['minute'] = df_hour.index.strftime('%M')
+    df_phy = df_hour[['doy', 'hour', 'minute', 'latitude', 'longitude']].copy()
+    df_phy['longitude'] = df_phy.longitude * -1
+    df_phy = df_phy.round({'latitude': 3, 'longitude': 3})
+    outfile = str(df_hour.trajectory_id.iloc[0]) + '_for_phyllis.csv'
+    df_phy.to_csv(outfile, sep=" ", index=False)
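+
+#Example invocations (the input paths and argos id are hypothetical):
+#  ./drifter_processing.py -if /path/to/12345_argos.csv -H -s -f
+#      resample to hourly fixes, add a speed column, write *_reformatted.csv
+#  ./drifter_processing.py -if /path/to/12345_argos.csv -H -s -i
+#      same, plus a nearest-gridcell ssmi ice concentration column
+#  ./drifter_processing.py -if /path/to/12345_argos.csv -de
+#      two-pass sst despiking with per-window logs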