From a4c68c8faedb77b431296f474fc1af43f9c55444 Mon Sep 17 00:00:00 2001
From: barneydobson
Date: Tue, 17 Dec 2019 10:04:13 +0000
Subject: [PATCH] Initial commit

---
 scripts/download_haduk_data.py        | 65 +++++++++++++++++++
 scripts/haduk_downloader_functions.py | 64 ++++++++++++++++++
 scripts/load_and_aggregate_data.py    | 98 ++++++++++++++++++++++++++++
 3 files changed, 227 insertions(+)
 create mode 100644 scripts/download_haduk_data.py
 create mode 100644 scripts/haduk_downloader_functions.py
 create mode 100644 scripts/load_and_aggregate_data.py

diff --git a/scripts/download_haduk_data.py b/scripts/download_haduk_data.py
new file mode 100644
index 0000000..7f1d978
--- /dev/null
+++ b/scripts/download_haduk_data.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+This is a script to download HadUK-Grid data, written by Barnaby Dobson, 5 Dec 2019
+"""
+
+import os
+import requests
+from tqdm import tqdm
+import haduk_downloader_functions
+
+"""User data
+This includes the username, password, output location and details for the download
+"""
+
+data={'username':'USER_NAME',
+      'password':'PASSWORD'} # create an account here: https://services.ceda.ac.uk/cedasite/register/info/
+
+
+output_folder = os.path.join("C:\\","Users","bdobson","Downloads","")
+start_year = 1975
+end_year = 1980 # inclusive
+grid_scale = 25 # km
+period = 'day' # 'day', 'mon' or 'ann'
+variable = 'rainfall' # tested 'rainfall' and 'tas' (temperature)
+
+
+"""Website information
+Give site and file urls
+Create dates and file names
+"""
+
+site_url = 'https://auth.ceda.ac.uk/account/signin/' # login page
+
+file_url_base = 'http://dap.ceda.ac.uk/thredds/fileServer/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.1.0/' # HadUK-Grid base address
+version = 'v20190808' # the latest release at the time of writing
+
+dates_series = haduk_downloader_functions.createDateseries(period,start_year,end_year)
+
+file_name = haduk_downloader_functions.getFilename(variable,grid_scale,period)
+
+file_location = haduk_downloader_functions.getFileweblocation(grid_scale,variable,period,version)
+
+
+"""Login
+If the SSL handshake fails, append the QuoVadis root certificate to the
+certifi bundle and retry
+"""
+
+try:
+    s = haduk_downloader_functions.startClient(data,site_url)
+except requests.exceptions.SSLError:
+    cert_location = 'qvevsslg3.pem' # Available from https://www.quovadisglobal.com/QVRepository/DownloadRootsAndCRL.aspx and converted to .pem following https://incognitjoe.github.io/adding-certs-to-requests.html
+    haduk_downloader_functions.addCertificate(cert_location)
+    s = haduk_downloader_functions.startClient(data,site_url)
+
+
+"""Iterate over the date series, downloading one netCDF file per period
+"""
+
+for period_text in tqdm(dates_series):
+    file_url = file_url_base + file_location + period_text + '.nc'
+    req = s.get(file_url)
+    req.raise_for_status() # fail early if a download did not succeed
+    with open(os.path.join(output_folder,file_name + period_text + '.nc'),'wb') as f:
+        f.write(req.content)
diff --git a/scripts/haduk_downloader_functions.py b/scripts/haduk_downloader_functions.py
new file mode 100644
index 0000000..138172f
--- /dev/null
+++ b/scripts/haduk_downloader_functions.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Dec  5 09:03:38 2019
+
+@author: bdobson
+"""
+import requests
+import pandas as pd
+import certifi
+
+def startClient(data,site_url):
+    """Log in to the CEDA data store and return an authenticated session
+    """
+    s = requests.Session()
+    s.get(site_url) # fetch the login page to obtain a CSRF cookie
+    csrftoken = s.cookies['csrftoken']
+    data['csrfmiddlewaretoken']=csrftoken
+    s.post(site_url,data=data,headers=dict(Referer=site_url))
+    return s
+
+def createDateseries(period,start_year,end_year):
+    """Build the date stems used in HadUK-Grid file names (end_year inclusive)
+    """
+    end_year += 1
+    if period == 'day':
+        # daily files are published one per month, e.g. '19750101-19750131'
+        dates = pd.date_range(pd.to_datetime(start_year*100+1,format='%Y%m'),
+                              pd.to_datetime(end_year*100+1,format='%Y%m'),
+                              freq='M')
+        dates_series = dates.strftime('%Y%m') + '01-' + dates.strftime('%Y%m%d')
+    else:
+        # monthly and annual files are published one per year, e.g. '197501-197512'
+        dates = pd.date_range(pd.to_datetime(start_year*100+1,format='%Y%m'),
+                              pd.to_datetime(end_year*100+1,format='%Y%m'),
+                              freq='Y')
+        dates_series = dates.strftime('%Y') + '01-' + dates.strftime('%Y%m')
+    return dates_series
+
+def getFilename(variable,grid_scale,period):
+    return '%s_hadukgrid_uk_%dkm_%s_' % (variable,
+                                         grid_scale,
+                                         period)
+
+def getFileweblocation(grid_scale,variable,period,version):
+    return '%dkm/%s/%s/%s/%s' % (grid_scale,
+                                 variable,
+                                 period,
+                                 version,
+                                 getFilename(variable,grid_scale,period))
+
+def addCertificate(cert_location):
+    """Append a custom root certificate to certifi's CA bundle (only needed once)
+    """
+    cafile = certifi.where()
+    with open(cert_location,'rb') as infile:
+        customca = infile.read()
+    with open(cafile,'ab') as outfile:
+        outfile.write(customca)
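+
+# A minimal sanity check, a sketch only: run this module directly to preview
+# the date stems and file-name prefix that the downloader will request.
+if __name__ == '__main__':
+    print(createDateseries('day', 1975, 1975)[:2]) # ['19750101-19750131', '19750201-19750228']
+    print(getFilename('rainfall', 25, 'day')) # 'rainfall_hadukgrid_uk_25km_day_'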
diff --git a/scripts/load_and_aggregate_data.py b/scripts/load_and_aggregate_data.py
new file mode 100644
index 0000000..e95176e
--- /dev/null
+++ b/scripts/load_and_aggregate_data.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+This is a script to aggregate HadUK-Grid data, written by Barnaby Dobson, 5 Dec 2019
+Data must have already been downloaded (see download_haduk_data.py)
+"""
+
+import os
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import Point
+import xarray as xr
+import haduk_downloader_functions
+from tqdm import tqdm
+
+"""Enter addresses and input data
+"""
+shp_address = os.path.join("C:\\","Users","bdobson","Downloads","example.shp")
+data_address = os.path.join("C:\\","Users","bdobson","Downloads","")
+output_address = os.path.join("C:\\","Users","bdobson","Downloads","")
+
+start_year = 1975
+end_year = 1980 # inclusive
+grid_scale = 25 # km
+period = 'day' # 'day', 'mon' or 'ann'
+variable = 'rainfall' # tested 'rainfall' and 'tas' (temperature)
+
+name = 'zone_name' # the column in the .shp file used to name the output files
+
+"""Load shapefile and create file names
+"""
+shp = gpd.read_file(shp_address).reset_index()
+
+dates_series = haduk_downloader_functions.createDateseries(period,start_year,end_year)
+
+file_name = haduk_downloader_functions.getFilename(variable,grid_scale,period)
+
+"""Load data (needs netcdf4 installed)
+"""
+data = []
+for period_text in tqdm(dates_series):
+    ds = xr.open_dataset(data_address + file_name + period_text + '.nc')
+    df = ds.to_dataframe()
+    data.append(df)
+data = pd.concat(data)
+
+"""Remove grid points that have no data
+(masked sea points carry a very large fill value)
+"""
+ind = data[variable] < 100000000
+data = data.loc[ind].reset_index()
+
+"""Assign grid points to the different shapes in the shapefile (this is slow)
+"""
+print('Assigning grid points')
+grid_points = data[['latitude','longitude']].drop_duplicates()
+points = []
+for idx in tqdm(grid_points.index):
+    row = grid_points.loc[idx]
+    ind = np.where((data.latitude == row.latitude) & (data.longitude == row.longitude)) # rows of 'data' at this grid point
+    points.append({'lon':row.longitude,'lat':row.latitude,'data_rows':ind,'index':str(idx),'geometry':Point(row.longitude,row.latitude)})
+points = gpd.GeoDataFrame(points)
+points = points.set_index('index')
+points.crs = {'init' :'epsg:4326'}
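+# A hedged guard, an assumption rather than part of the original workflow: if
+# the shapefile is not already in WGS84 (e.g. it is in British National Grid),
+# reproject it to match the points before the spatial join.
+if shp.crs is not None and shp.crs != points.crs:
+    shp = shp.to_crs(points.crs)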
+points = gpd.tools.sjoin(points,shp,how='left')
+
+"""Aggregate data and print to csv
+"""
+for i in shp['index']:
+    point_ind = points[name] == shp.loc[i,name]
+    in_geom = np.concatenate(np.concatenate(points.loc[point_ind,'data_rows'])) # flatten the np.where output into one array of row indices
+
+    data_in_geom = data.iloc[in_geom].reset_index()
+    data_in_geom = data_in_geom.groupby('time').mean() # Average over all points inside a shape
+    data_in_geom['date'] = data_in_geom.index.date.tolist()
+    data_in_geom = data_in_geom[['date',variable]]
+
+    iname = shp.loc[shp['index']==i,name].iloc[0]
+
+    data_in_geom.to_csv(os.path.join(output_address,
+                                     '_'.join([iname,
+                                               variable,
+                                               str(start_year),
+                                               str(end_year),
+                                               period,
+                                               str(grid_scale),
+                                               'km'])+".csv"),
+                        sep=',',index=False)
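+
+# A quick read-back check, a sketch only: load the last CSV written (pattern
+# '<zone>_<variable>_<start>_<end>_<period>_<km>_km.csv') and show the first rows.
+last_file = '_'.join([iname,variable,str(start_year),str(end_year),period,str(grid_scale),'km']) + '.csv'
+print(pd.read_csv(os.path.join(output_address,last_file)).head())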