From a4c68c8faedb77b431296f474fc1af43f9c55444 Mon Sep 17 00:00:00 2001
From: barneydobson
Date: Tue, 17 Dec 2019 10:04:13 +0000
Subject: [PATCH] Initial commit

---
 scripts/download_haduk_data.py        | 65 +++++++++++++++++++
 scripts/haduk_downloader_functions.py | 64 ++++++++++++++++++
 scripts/load_and_aggregate_data.py    | 98 ++++++++++++++++++++++++++++
 3 files changed, 227 insertions(+)
 create mode 100644 scripts/download_haduk_data.py
 create mode 100644 scripts/haduk_downloader_functions.py
 create mode 100644 scripts/load_and_aggregate_data.py

diff --git a/scripts/download_haduk_data.py b/scripts/download_haduk_data.py
new file mode 100644
index 0000000..7f1d978
--- /dev/null
+++ b/scripts/download_haduk_data.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+This is a script to download HadUK-Grid data, written by Barnaby Dobson, 5 Dec 2019
+"""
+
+import os
+import requests
+from tqdm import tqdm
+import haduk_downloader_functions
+
+"""User data
+This includes the username, password, output location and details for the download
+"""
+
+data={'username':'USER_NAME',
+      'password':'PASSWORD'} # create an account here: https://services.ceda.ac.uk/cedasite/register/info/
+
+
+output_folder = os.path.join("C:\\","Users","bdobson","Downloads","")
+start_year = 1975
+end_year = 1980 # inclusive
+grid_scale = 25 # km
+period = 'day' # 'day', 'mon' or 'ann'
+variable = 'rainfall' # tested 'rainfall' and 'tas' (temperature)
+
+
+"""Website information
+Give site and file urls
+Create dates and file names
+"""
+
+site_url = 'https://auth.ceda.ac.uk/account/signin/' # login page
+
+file_url_base = 'http://dap.ceda.ac.uk/thredds/fileServer/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.1.0/' # HadUK-Grid base address
+version = 'v20190808' # the latest release at the time of writing
+
+dates_series = haduk_downloader_functions.createDateseries(period,start_year,end_year)
+
+file_name = haduk_downloader_functions.getFilename(variable,grid_scale,period)
+
+file_location = haduk_downloader_functions.getFileweblocation(grid_scale,variable,period,version)
+
+
+"""Login
+If the SSL handshake fails, append the QuoVadis root certificate to the
+certifi bundle and retry
+"""
+
+try:
+    s = haduk_downloader_functions.startClient(data,site_url)
+except requests.exceptions.SSLError:
+    cert_location = 'qvevsslg3.pem' # Available from https://www.quovadisglobal.com/QVRepository/DownloadRootsAndCRL.aspx and converted to .pem following https://incognitjoe.github.io/adding-certs-to-requests.html
+    haduk_downloader_functions.addCertificate(cert_location)
+    s = haduk_downloader_functions.startClient(data,site_url)
+
+
+"""Iterate over the date series, downloading one netCDF file per period
+"""
+
+for period_text in tqdm(dates_series):
+    file_url = file_url_base + file_location + period_text + '.nc'
+    req = s.get(file_url)
+    req.raise_for_status() # fail early if a download did not succeed
+    with open(os.path.join(output_folder,file_name + period_text + '.nc'),'wb') as f:
+        f.write(req.content)
diff --git a/scripts/haduk_downloader_functions.py b/scripts/haduk_downloader_functions.py
new file mode 100644
index 0000000..138172f
--- /dev/null
+++ b/scripts/haduk_downloader_functions.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Dec  5 09:03:38 2019
+
+@author: bdobson
+"""
+import requests
+import pandas as pd
+import certifi
+
+def startClient(data,site_url):
+    """Log in to the CEDA data store and return an authenticated session
+    """
+    s = requests.Session()
+    s.get(site_url) # fetch the login page to obtain a CSRF cookie
+    csrftoken = s.cookies['csrftoken']
+    data['csrfmiddlewaretoken']=csrftoken
+    s.post(site_url,data=data,headers=dict(Referer=site_url))
+    return s
+
+def createDateseries(period,start_year,end_year):
+    """Build the date stems used in HadUK-Grid file names (end_year inclusive)
+    """
+    end_year += 1
+    if period == 'day':
+        # daily files are published one per month, e.g. '19750101-19750131'
+        dates = pd.date_range(pd.to_datetime(start_year*100+1,format='%Y%m'),
+                              pd.to_datetime(end_year*100+1,format='%Y%m'),
+                              freq='M')
+        dates_series = dates.strftime('%Y%m') + '01-' + dates.strftime('%Y%m%d')
+    else:
+        # monthly and annual files are published one per year, e.g. '197501-197512'
+        dates = pd.date_range(pd.to_datetime(start_year*100+1,format='%Y%m'),
+                              pd.to_datetime(end_year*100+1,format='%Y%m'),
+                              freq='Y')
+        dates_series = dates.strftime('%Y') + '01-' + dates.strftime('%Y%m')
+    return dates_series
+
+def getFilename(variable,grid_scale,period):
+    return '%s_hadukgrid_uk_%dkm_%s_' % (variable,
+                                         grid_scale,
+                                         period)
+
+def getFileweblocation(grid_scale,variable,period,version):
+    return '%dkm/%s/%s/%s/%s' % (grid_scale,
+                                 variable,
+                                 period,
+                                 version,
+                                 getFilename(variable,grid_scale,period))
+
+def addCertificate(cert_location):
+    """Append a custom root certificate to certifi's CA bundle (only needed once)
+    """
+    cafile = certifi.where()
+    with open(cert_location,'rb') as infile:
+        customca = infile.read()
+    with open(cafile,'ab') as outfile:
+        outfile.write(customca)
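+
+# A minimal sanity check, a sketch only: run this module directly to preview
+# the date stems and file-name prefix that the downloader will request.
+if __name__ == '__main__':
+    print(createDateseries('day', 1975, 1975)[:2]) # ['19750101-19750131', '19750201-19750228']
+    print(getFilename('rainfall', 25, 'day')) # 'rainfall_hadukgrid_uk_25km_day_'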
diff --git a/scripts/load_and_aggregate_data.py b/scripts/load_and_aggregate_data.py
new file mode 100644
index 0000000..e95176e
--- /dev/null
+++ b/scripts/load_and_aggregate_data.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+This is a script to aggregate HadUK-Grid data, written by Barnaby Dobson, 5 Dec 2019
+Data must have already been downloaded (see download_haduk_data.py)
+"""
+
+import os
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import Point
+import xarray as xr
+import haduk_downloader_functions
+from tqdm import tqdm
+
+"""Enter addresses and input data
+"""
+shp_address = os.path.join("C:\\","Users","bdobson","Downloads","example.shp")
+data_address = os.path.join("C:\\","Users","bdobson","Downloads","")
+output_address = os.path.join("C:\\","Users","bdobson","Downloads","")
+
+start_year = 1975
+end_year = 1980 # inclusive
+grid_scale = 25 # km
+period = 'day' # 'day', 'mon' or 'ann'
+variable = 'rainfall' # tested 'rainfall' and 'tas' (temperature)
+
+name = 'zone_name' # the column in the .shp file used to name the output files
+
+"""Load shapefile and create file names
+"""
+shp = gpd.read_file(shp_address).reset_index()
+
+dates_series = haduk_downloader_functions.createDateseries(period,start_year,end_year)
+
+file_name = haduk_downloader_functions.getFilename(variable,grid_scale,period)
+
+"""Load data (needs netcdf4 installed)
+"""
+data = []
+for period_text in tqdm(dates_series):
+    ds = xr.open_dataset(data_address + file_name + period_text + '.nc')
+    df = ds.to_dataframe()
+    data.append(df)
+data = pd.concat(data)
+
+"""Remove grid points that have no data
+(masked sea points carry a very large fill value)
+"""
+ind = data[variable] < 100000000
+data = data.loc[ind].reset_index()
+
+"""Assign grid points to the different shapes in the shapefile (this is slow)
+"""
+print('Assigning grid points')
+grid_points = data[['latitude','longitude']].drop_duplicates()
+points = []
+for idx in tqdm(grid_points.index):
+    row = grid_points.loc[idx]
+    ind = np.where((data.latitude == row.latitude) & (data.longitude == row.longitude)) # rows of 'data' at this grid point
+    points.append({'lon':row.longitude,'lat':row.latitude,'data_rows':ind,'index':str(idx),'geometry':Point(row.longitude,row.latitude)})
+points = gpd.GeoDataFrame(points)
+points = points.set_index('index')
+points.crs = {'init' :'epsg:4326'}
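+# A hedged guard, an assumption rather than part of the original workflow: if
+# the shapefile is not already in WGS84 (e.g. it is in British National Grid),
+# reproject it to match the points before the spatial join.
+if shp.crs is not None and shp.crs != points.crs:
+    shp = shp.to_crs(points.crs)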
+points = gpd.tools.sjoin(points,shp,how='left')
+
+"""Aggregate data and print to csv
+"""
+for i in shp['index']:
+    point_ind = points[name] == shp.loc[i,name]
+    in_geom = np.concatenate(np.concatenate(points.loc[point_ind,'data_rows'])) # flatten the np.where output into one array of row indices
+
+    data_in_geom = data.iloc[in_geom].reset_index()
+    data_in_geom = data_in_geom.groupby('time').mean() # Average over all points inside a shape
+    data_in_geom['date'] = data_in_geom.index.date.tolist()
+    data_in_geom = data_in_geom[['date',variable]]
+
+    iname = shp.loc[shp['index']==i,name].iloc[0]
+
+    data_in_geom.to_csv(os.path.join(output_address,
+                                     '_'.join([iname,
+                                               variable,
+                                               str(start_year),
+                                               str(end_year),
+                                               period,
+                                               str(grid_scale),
+                                               'km'])+".csv"),
+                        sep=',',index=False)
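+
+# A quick read-back check, a sketch only: load the last CSV written (pattern
+# '<zone>_<variable>_<start>_<end>_<period>_<km>_km.csv') and show the first rows.
+last_file = '_'.join([iname,variable,str(start_year),str(end_year),period,str(grid_scale),'km']) + '.csv'
+print(pd.read_csv(os.path.join(output_address,last_file)).head())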