
Commit

Initial commit
barneydobson authored Dec 17, 2019
1 parent 2a1404d commit a4c68c8
Showing 3 changed files with 201 additions and 0 deletions.
61 changes: 61 additions & 0 deletions scripts/download_haduk_data.py
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
"""
This is a script to download HadUK-Grid data, written by Barnaby Dobson, 5th Dec 2019
"""

import os
import requests
from tqdm import tqdm
import haduk_downloader_functions

"""User data
This includes, username, password, output location and details for the download
"""

data={'username':'USER_NAME',
'password':'PASSWORD'} # create an account here: https://services.ceda.ac.uk/cedasite/register/info/


output_folder = os.path.join("C:\\", "Users", "bdobson", "Downloads", "")
start_year = 1975
end_year = 1980  # Inclusive: files are downloaded through the end of end_year
grid_scale = 25  # km
period = 'day'  # 'day', 'mon' or 'ann'
variable = 'rainfall'  # tested with 'rainfall' and 'tas' (temperature)


"""Website information
Give site and file urls
Create dates and file names
"""

site_url = 'https://auth.ceda.ac.uk/account/signin/' # login page

file_url_base = 'http://dap.ceda.ac.uk/thredds/fileServer/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.1.0/' # hadUK data base address
version = 'v20190808'  # dataset version string (the latest release at the time of writing)

dates_series = haduk_downloader_functions.createDateseries(period,start_year,end_year)

file_name = haduk_downloader_functions.getFilename(variable,grid_scale,period)

file_location = haduk_downloader_functions.getFileweblocation(grid_scale,variable,period,version)


"""Login
"""

try:
    s = haduk_downloader_functions.startClient(data, site_url)
except requests.exceptions.SSLError:
    # Certificate verification failed: add the QuoVadis root certificate and retry.
    # The certificate is available from https://www.quovadisglobal.com/QVRepository/DownloadRootsAndCRL.aspx
    # and is converted to .pem following https://incognitjoe.github.io/adding-certs-to-requests.html
    cert_location = 'qvevsslg3.pem'
    haduk_downloader_functions.addCertificate(cert_location)
    s = haduk_downloader_functions.startClient(data, site_url)


"""Iterate over years
"""

for period_text in tqdm(dates_series):
    file_url = file_url_base + file_location + period_text + '.nc'
    req = s.get(file_url)
    with open(os.path.join(output_folder, file_name + period_text + '.nc'), 'wb') as f:
        f.write(req.content)
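
# As a sanity check, with the settings above the first file requested by this
# loop should be:
#   http://dap.ceda.ac.uk/thredds/fileServer/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.1.0/25km/rainfall/day/v20190808/rainfall_hadukgrid_uk_25km_day_19750101-19750131.nc
# saved to output_folder as rainfall_hadukgrid_uk_25km_day_19750101-19750131.nc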
53 changes: 53 additions & 0 deletions scripts/haduk_downloader_functions.py
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 5 09:03:38 2019
@author: bdobson
"""
import requests
import pandas as pd
import certifi

def startClient(data, site_url):
    """Log in to the CEDA data store and return the authenticated session.
    """
    s = requests.Session()
    s.get(site_url)  # fetch the signin page to obtain a CSRF cookie
    csrftoken = s.cookies['csrftoken']
    data['csrfmiddlewaretoken'] = csrftoken
    s.post(site_url, data=data, headers=dict(Referer=site_url))  # log in; the session now carries the auth cookies
    return s
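
# A minimal usage sketch (hypothetical credentials); this assumes the signin
# page sets a 'csrftoken' cookie, as the CEDA login page does:
#   s = startClient({'username': 'me', 'password': 'secret'},
#                   'https://auth.ceda.ac.uk/account/signin/')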

def createDateseries(period, start_year, end_year):
    """Create the date stamps used in HadUK-Grid file names (end_year inclusive).
    """
    end_year += 1
    if period == 'day':
        dates = pd.date_range(pd.to_datetime(start_year*100+1, format='%Y%m'),
                              pd.to_datetime(end_year*100+1, format='%Y%m'),
                              freq='M')  # month ends; daily files are stored one month per file
        dates_series = dates.strftime('%Y%m') + '01-' + dates.strftime('%Y%m%d')
    else:
        dates = pd.date_range(pd.to_datetime(start_year*100+1, format='%Y%m'),
                              pd.to_datetime(end_year*100+1, format='%Y%m'),
                              freq='Y')  # year ends; 'mon' and 'ann' files cover one year each
        dates_series = dates.strftime('%Y') + '01-' + dates.strftime('%Y%m')
    return dates_series
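
# For example, createDateseries('day', 1975, 1975) returns one stamp per month:
#   ['19750101-19750131', '19750201-19750228', ..., '19751201-19751231']
# while createDateseries('mon', 1975, 1975) returns ['197501-197512'].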

def getFilename(variable, grid_scale, period):
    return '%s_hadukgrid_uk_%dkm_%s_' % (variable,
                                         grid_scale,
                                         period)

def getFileweblocation(grid_scale, variable, period, version):
    return '%dkm/%s/%s/%s/%s' % (grid_scale,
                                 variable,
                                 period,
                                 version,
                                 getFilename(variable, grid_scale, period))
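
# For example, getFileweblocation(25, 'rainfall', 'day', 'v20190808') gives
#   '25km/rainfall/day/v20190808/rainfall_hadukgrid_uk_25km_day_'
# i.e. the path under the THREDDS base URL, ready for a date stamp and '.nc'.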

def addCertificate(cert_location):
    """Append a custom root certificate to certifi's CA bundle.
    """
    cafile = certifi.where()
    with open(cert_location, 'rb') as infile:
        customca = infile.read()
    with open(cafile, 'ab') as outfile:
        outfile.write(customca)
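
# Note that this writes to certifi's bundled CA file on disk, so it persists
# until certifi is reinstalled and only needs to run once per environment, e.g.
#   addCertificate('qvevsslg3.pem')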
87 changes: 87 additions & 0 deletions scripts/load_and_aggregate_data.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
This is a script to aggregate HadUK-Grid data, written by Barnaby Dobson, 5th Dec 2019
Data must have already been downloaded (see download_haduk_data.py)
"""

import pandas as pd
import numpy as np  # np.where and np.concatenate are used below
import geopandas as gpd
from shapely.geometry import Point
import xarray as xr
import os
import haduk_downloader_functions
from tqdm import tqdm

"""Enter addresses and input data
"""
shp_address = os.path.join("C:\\","Users","bdobson","Downloads","example.shp")
data_address = os.path.join("C:\\","Users","bdobson","Downloads","")
output_address = os.path.join("C:\\","Users","bdobson","Downloads","")

start_year = 1975
end_year = 1980  # Inclusive, as in download_haduk_data.py
grid_scale = 25  # km
period = 'day'  # 'day', 'mon' or 'ann'
variable = 'rainfall'  # tested with 'rainfall' and 'tas' (temperature)

name = 'zone_name'  # the column in the .shp file used to name the output files

"""Load shapefile and create file names
"""
shp = gpd.read_file(shp_address).reset_index()

dates_series = haduk_downloader_functions.createDateseries(period,start_year,end_year)

file_name = haduk_downloader_functions.getFilename(variable,grid_scale,period)

"""Load data (needs netcdf4 installed)
"""
data = []
for period_text in tqdm(dates_series):
    ds = xr.open_dataset(data_address + file_name + period_text + '.nc')
    df = ds.to_dataframe()
    data.append(df)
data = pd.concat(data)

"""Remove grid points that have no data
"""
ind = data[variable] < 100000000
data = data.loc[ind].reset_index()

"""Assign grid points to the different shapes in the shapefile (this is slow)
"""
print('Assigning grid points')
grid_points = data[['latitude','longitude']].drop_duplicates()
points = []
for idx in tqdm(grid_points.index):
    row = grid_points.loc[idx]
    ind = np.where((data.latitude == row.latitude) & (data.longitude == row.longitude))
    points.append({'lon': row.longitude,
                   'lat': row.latitude,
                   'data_rows': ind,
                   'index': str(idx),
                   'geometry': Point(row.longitude, row.latitude)})
points = gpd.GeoDataFrame(points)
points = points.set_index('index')
points.crs = {'init': 'epsg:4326'}
points = gpd.tools.sjoin(points, shp, how='left')  # assumes the shapefile is also in WGS84 (EPSG:4326)

"""Aggregate data and print to csv
"""
for i in shp['index']:
    point_ind = points[name] == shp.loc[i, name]
    # 'data_rows' holds np.where output (a tuple of index arrays), so two
    # concatenations flatten the per-point tuples into one array of row numbers
    in_geom = np.concatenate(np.concatenate(points.loc[point_ind, 'data_rows']))

    data_in_geom = data.iloc[in_geom].reset_index()
    data_in_geom = data_in_geom.groupby('time').mean()  # average over all points inside a shape
    data_in_geom['date'] = data_in_geom.index.date.tolist()
    data_in_geom = data_in_geom[['date', variable]]

    iname = shp.loc[shp['index'] == i, name].iloc[0]

    data_in_geom.to_csv(os.path.join(output_address,
                                     '_'.join([iname,
                                               variable,
                                               str(start_year),
                                               str(end_year),
                                               period,
                                               str(grid_scale),
                                               'km']) + ".csv"),
                        sep=',', index=False)
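
# With the settings above and a zone named e.g. 'thames' in the shapefile, this
# writes thames_rainfall_1975_1980_day_25_km.csv to output_address, containing
# one mean value per time step averaged over the grid points inside the zone.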
