# -*- coding: utf-8 -*-
"""
Created on Mon Sep 25 14:31:51 2023
@author: geolki
updated by Julia Crook (CEMAC) to optimize the code by only reading what is necessary from csv files
and by computing footprint using the components rather than a big matrix
I have added read_config to get the required parameters to run either on Large data or Small data.
Also it makes the code more configurable in where the files are stored.
"""

import pandas as pd
import numpy as np
#import scipy.linalg as sla

#-----------------------------------------
# to handle memory profiling line by line
#-----------------------------------------
import os
mprof = os.environ.get('MPROFILE_Lbl', '0') # environment variables are always strings
print('MPROFILE_Lbl', mprof)
if mprof == '1':
    print('cef: importing mprofiler')
    from memory_profiler import profile
else:
    # fall back to a local profile module that is expected to provide a no-op
    # @profile decorator (the stdlib profile module does not define one)
    from profile import *

#----------------------------------------------------------------
# read config file to get indir, outdir and the filenames, which are
# different for small and large data
#----------------------------------------------------------------
def read_config(cfname):
    with open(cfname, "r") as cf:
        lines = cf.readlines()
    if len(lines) < 7:
        print('invalid config file')
        raise SystemExit
    # remove the trailing '\n' from each line (the last line may not have one,
    # so rstrip rather than slicing)
    indir = lines[0].rstrip('\n')
    outdir = lines[1].rstrip('\n')
    labels_fname = lines[2].rstrip('\n')
    lookup_fname = lines[3].rstrip('\n')
    Z_fname = lines[4].rstrip('\n')
    Y_fname = lines[5].rstrip('\n')
    co2_fname = lines[6].rstrip('\n')
    return indir, outdir, labels_fname, lookup_fname, Z_fname, Y_fname, co2_fname
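
# Example config file (hypothetical paths and filenames, one entry per line,
# in exactly this order):
#   /data/gloria/input/
#   /data/gloria/output/
#   GLORIA_ReadMe.xlsx
#   lookup.xlsx
#   Z.csv
#   Y.csv
#   co2.csv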

#######################################################################################
## Gloria Metadata
## The next two functions are for reading the lookup and labels excel files to
## determine the multiIndices for Z and Y
#######################################################################################

#----------------------------------------------------------------
# this function splits each string into <country>, (<code>) and <some_str>,
# where <some_str> is the remainder of the string
# inputs:
#    labels is a list of strings containing <country> (<code>) <some_str>
#       note that the string before the code may also contain ()
#    valid_country_codes is a list of valid country codes
# returns:
#    country - list of <country>
#    country_code - list of <code>
#    remainder - list of <some_str>
#------------------------------------------------------
def split_country_and_code(labels, valid_country_codes):
    nl = len(labels)
    country = ['NA']*nl
    country_code = ['']*nl
    remainder = ['']*nl
    n = 0
    for cs in labels:
        items = cs.split('(')
        # find which part has the country code
        for item in items:
            items2 = item.split(')')
            if items2[0] in valid_country_codes:
                country_code[n] = items2[0]
                # country is the string before the country code and remainder is the part after
                items3 = cs.split('(' + items2[0] + ')')
                country[n] = items3[0]
                remainder[n] = items3[1]
        if country[n] == 'NA':
            print('Could not find country code in', cs)
        n += 1
    # end the code if there is a mismatch in labels; this is done to test that
    # the code does what it should
    if 'NA' in country:
        print('Error: Missing country labels')
        raise SystemExit
    return country, country_code, remainder
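
# Example (hypothetical label, for illustration):
#   split_country_and_code(['Australia (AUS) Growing cereals industry'], ['AUS'])
#   -> (['Australia '], ['AUS'], [' Growing cereals industry'])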

#----------------------------------------------------------------
# This function reads the excel files containing the indices in Z
# mrio_filepath is the directory where they exist
# labels_fname is the filename for the labels file
# lookup_fname is the filename for the lookup file
#----------------------------------------------------------------
@profile
def get_metadata_indices(mrio_filepath, labels_fname, lookup_fname):
    # read metadata which is used for column and row labels later on
    readme = mrio_filepath + labels_fname
    labels = pd.read_excel(readme, sheet_name=None)
    # get lookup to fix labels
    lookup = pd.read_excel(mrio_filepath + lookup_fname, sheet_name=None)
    # get list of countries in dataset
    lookup['countries'] = lookup['countries'][['gloria', 'gloria_code']].drop_duplicates().dropna()
    lookup['countries']['gloria_combo'] = lookup['countries']['gloria'] + ' (' + lookup['countries']['gloria_code'] + ') '
    # get list of sectors in dataset
    lookup['sectors'] = lookup['sectors'][['gloria']].drop_duplicates().dropna()
    # fix Z labels
    # This helps being able to split the 'country' from the 'sector' component in the label later on.
    # Essentially this removes some special characters from the country names, making it easier to
    # split the country from the sector by a specific character later on
    t_cats = pd.DataFrame(labels['Sequential region-sector labels']['Sequential_regionSector_labels']).drop_duplicates()
    t_cats.columns = ['label']
    # remove special characters from country names - JAC this seems to be only removing sector
    # JAC replace loop with call to function
    valid_country_codes = lookup['countries']['gloria_code'].tolist()
    country, country_code, remainder = split_country_and_code(t_cats['label'], valid_country_codes)
    #t_cats['country'] = country # don't really need this as not used
    t_cats['country_code'] = country_code
    t_cats['sector'] = remainder
    # fix final demand labels (this is in the Y dataframe later)
    # it follows the same logic as the Z label fix above
    fd_cats = pd.DataFrame(labels['Sequential region-sector labels']['Sequential_finalDemand_labels'].dropna(how='all', axis=0))
    fd_cats.columns = ['label']
    # remove special characters from country names
    # JAC replace loop below with call to function
    country, country_code, remainder = split_country_and_code(fd_cats['label'], valid_country_codes)
    #fd_cats['country'] = country # don't really need this as not used
    fd_cats['country_code'] = country_code
    fd_cats['fd'] = remainder
    # split labels by industries vs products (these later get split into the S and U dataframes)
    t_cats['ind'] = t_cats['label'].str[-8:]
    industries = t_cats.loc[t_cats['ind'] == 'industry']
    products = t_cats.loc[t_cats['ind'] == ' product']
    sector_type = np.asarray(t_cats['ind'])
    iix = np.where(sector_type == 'industry')[0]
    pix = np.where(sector_type == ' product')[0]
    # make index labels
    z_idx = pd.MultiIndex.from_arrays([t_cats['country_code'], t_cats['sector']]) # labels for Z dataframe
    industry_idx = pd.MultiIndex.from_arrays([industries['country_code'], industries['sector']]) # labels used to split Z into S and U
    product_idx = pd.MultiIndex.from_arrays([products['country_code'], products['sector']]) # labels used to split Z into S and U
    y_cols = pd.MultiIndex.from_arrays([fd_cats['country_code'], fd_cats['fd']]) # labels for Y dataframe
    sat_rows = labels['Satellites']['Sat_indicator'] # labels for CO2 dataframe
    # clear space in variable explorer to free up memory
    # del t_cats, temp_c, cs, a, c, temp_s, i, temp, fd_cats, industries, products, item
    # running the code to this point normally takes around 15 seconds
    return z_idx, industry_idx, product_idx, iix, pix, y_cols, sat_rows

###########################################################################################
## Reading Data
## I now have 2 versions of the code that read the data and process it into the footprint:
## the old version that uses big matrices, and a new version that uses only the components
## that are actually useful, i.e. non zero.
## read_data_new/old do the following:
##    read S and U from Z (z_filepath)
##    read the product rows from Y (y_filepath)
##    read the industry columns and stressor_row from co2_filepath.
##
## In read_data_new, only the required rows and cols are loaded from everything.
## There are 2 versions of cef.indirect_footprint_SUT to calculate the footprint;
## cef.indirect_footprint_SUT_new calculates the footprint using only the data given,
## without constructing big matrices again.
###########################################################################################

#--------------------------------------------------------------------
# function read_data_new
# This reads only the relevant rows and columns from Z, Y and co2file;
# this is the new way to get the data using much less memory
# inputs:
#    z_filepath - the filepath from where to read the big matrix Z
#    y_filepath - the filepath from where to read Y
#    co2_filepath - the filepath from where to read the stressor data
#    iix - these are the indices (0,1,...) where the industry rows/columns are
#    pix - these are the indices (0,1,...) where the product rows/columns are
#    industry_idx - this is the multiIndex to set in S/U/stressor
#    product_idx - this is the multiIndex to set in S/U/Y
#    y_cols - the multiIndex to set in Y columns
#    stressor_row - the row to pick out from co2_filepath
# returns:
#    S - the [industry_idx, product_idx] part of Z
#    U - the [product_idx, industry_idx] part of Z
#    Y - the product_idx rows of Y
#    stressor - the [stressor_row, industry_idx] part of co2_filepath (NB this is a single row)
#--------------------------------------------------------------------
@profile
def read_data_new(z_filepath, y_filepath, co2_filepath, iix, pix, industry_idx, product_idx, y_cols, stressor_row):
    # read S and U directly from Z by specifying specific rows and cols
    S = pd.read_csv(z_filepath, header=None, index_col=None, skiprows=lambda x: x not in iix, usecols=pix)
    S.index = industry_idx; S.columns = product_idx
    U = pd.read_csv(z_filepath, header=None, index_col=None, skiprows=lambda x: x not in pix, usecols=iix)
    U.index = product_idx; U.columns = industry_idx
    # read product rows of Y and rename index and column
    # JAC just read the required rows directly and set up the indices for the columns
    Y = pd.read_csv(y_filepath, header=None, index_col=None, skiprows=lambda x: x not in pix)
    Y.index = product_idx; Y.columns = y_cols
    # import stressor (co2) data
    # JAC just read the required row and columns directly from the csv
    stressor = pd.read_csv(co2_filepath, header=None, index_col=None, nrows=1, skiprows=stressor_row, usecols=iix)
    stressor.columns = industry_idx
    return S, U, Y, stressor
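
# Note: with header=None every csv line is data, so the skiprows callable is
# evaluated against 0-based row positions; skiprows=lambda x: x not in iix keeps
# only the industry rows, and usecols=pix keeps only the product columns.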

#--------------------------------------------------------------------
# function read_data_old
# This reads the relevant rows and columns from Z, Y and co2file by reading
# the whole lot and then selecting the required rows/columns;
# this is the old way this worked
# inputs:
#    z_filepath - the filepath from where to read the big matrix Z
#    y_filepath - the filepath from where to read Y
#    co2_filepath - the filepath from where to read the stressor data
#    z_idx - this is the multiIndex for the whole of Z
#    industry_idx - this is the multiIndex to select the relevant parts for S/U
#    product_idx - this is the multiIndex to select the relevant parts for S/U
#    y_cols - the multiIndex to set in Y columns
#    sat_rows - the indices of all the rows of stressor in co2_filepath
#    stressor_cat - used to find the particular row needed from co2_filepath
# returns:
#    S - the [industry_idx, product_idx] part of Z
#    U - the [product_idx, industry_idx] part of Z
#    Y - the product_idx rows of Y
#    stressor - the [stressor_row, industry_idx] part of co2_filepath (NB this is a single row)
#--------------------------------------------------------------------
@profile
def read_data_old(z_filepath, y_filepath, co2_filepath, z_idx, industry_idx, product_idx, y_cols, sat_rows, stressor_cat):
    # import Z file to make S and U tables
    # This is not necessary, but it then follows the same structure as the other datasets we use,
    # which is why this is currently done.
    # Splitting Z here mainly allows us to use a single function for multiple datasets; from what
    # I can tell this is not the part taking super long, so I have kept it like this.
    # It's possible that just making a new function for the Gloria data would be better than
    # reorganising this data twice, as S and U are later combined into Z again, but with
    # slightly different index and column order
    Z = pd.read_csv(z_filepath, header=None, index_col=None)
    Z.index = z_idx; Z.columns = z_idx
    S = Z.loc[industry_idx, product_idx]
    U = Z.loc[product_idx, industry_idx]
    del Z # remove Z to clear memory
    # import Y and rename index and column
    # again, it's matched to the structure of other datasets we analyse
    Y = pd.read_csv(y_filepath, header=None, index_col=None)
    Y.index = z_idx; Y.columns = y_cols
    Y = Y.loc[product_idx]
    # import stressor (co2) data
    # again, it's matched to the structure of other datasets we analyse
    stressor = pd.read_csv(co2_filepath, header=None, index_col=None)
    stressor.index = sat_rows; stressor.columns = z_idx
    stressor = stressor.loc[stressor_cat, industry_idx]
    return S, U, Y, stressor

###########################################################################################
## Gloria Emissions calculations
## The following functions are for calculating the footprint
###########################################################################################
@profile
def make_x(Z, Y, verbose):
    # total output x is the row sum of Z plus the row sum of Y
    x = np.sum(Z, 1) + np.sum(Y, 1)
    x[x == 0] = 0.000000001 # do this so dividing by 0 does not happen
    if verbose:
        print("DBG: X shape is ", x.shape)
    return x

# equivalent function of make_x but does it as components
@profile
def make_x_comp_new(S, U, Y, verbose):
    # components of what was x
    sumS = np.sum(S, 1) # this is x1
    sumS[sumS == 0] = 0.000000001 # do this so dividing by 0 does not happen
    sumU = np.sum(U, 1)
    sumY = np.sum(Y, 1)
    sumUY = sumU + sumY # this is x2
    sumUY[sumUY == 0] = 0.000000001 # do this so dividing by 0 does not happen
    if verbose:
        print('DBG: sumS, sumUY shape is ', sumS.shape, sumUY.shape)
    return sumS, sumUY
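
# Correspondence to make_x (illustrative note): in the stacked system the old x
# was [x1, x2], with x1 = row sums of S (industry rows) and x2 = row sums of U
# plus row sums of Y (product rows), so sumS and sumUY are its two halves.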

@profile
def make_L(Z, x, verbose):
    # A[i,j] = Z[i,j]/x[j], i.e. normalise each column of Z by total output
    bigX = np.tile(np.transpose(x), (len(Z), 1))
    A = np.divide(Z, bigX)
    #np.save('A_old.npy', A)
    I_minus_A = np.identity(len(Z)) - A
    L = np.linalg.inv(I_minus_A)
    #np.save('L_old.npy', L)
    if verbose:
        print('DBG: bigX shape is ', bigX.shape)
        print('DBG: A shape is ', A.shape)
        print('DBG: L shape is ', L.shape)
    return L

# equivalent of make_L but does it as components
@profile
def make_L_comp_new(S, U, sumS, sumUY, verbose):
    bigSumS = np.tile(np.transpose(sumS), (S.shape[0], 1))
    bigSumUY = np.tile(np.transpose(sumUY), (U.shape[0], 1))
    # use elementwise divide as was done in make_L to get A
    Snorm = np.divide(S, bigSumUY)
    Unorm = np.divide(U, bigSumS)
    #np.save('Snorm.npy', Snorm)
    #np.save('Unorm.npy', Unorm)
    # in equation [I-A]X=D, where I-A top is [I, -Snorm] and I-A bottom is [-Unorm, I],
    # Dtop is 0 and Dbottom is Y.
    # assume X has X1 as the top part and X2 as the bottom part, from which we get
    # 1. X1 - Snorm.X2 = 0 and
    # 2. -Unorm.X1 + X2 = Y
    #
    # from 1. we get 3. X1 = Snorm.X2
    # insert into 2. X2 - Unorm.(Snorm.X2) = Y
    # so (I - Unorm.Snorm).X2 = Y
    # so X2 = inv(I - Unorm.Snorm).Y
    # If we say L has components [Ltl, Ltr]
    #                            [Lbl, Lbr]
    # then Ltl.0 + Ltr.Y = X1, i.e. Ltr.Y = X1 = Snorm.X2 = Snorm.inv(I-Unorm.Snorm).Y
    # so Ltr = Snorm.inv(I-Unorm.Snorm)
    # and Lbl.0 + Lbr.Y = X2 = inv(I-Unorm.Snorm).Y
    # so Lbr = inv(I-Unorm.Snorm)
    # we cannot say anything about Ltl and Lbl from these equations.
    # we will be using L to do e.L, where the bottom part of e is 0; this means we will need Ltl and Ltr.
    # We also know that [Ltl, Ltr] [I,      -Snorm] = [I, 0]
    #                   [Lbl, Lbr] [-Unorm,  I    ]   [0, I]
    # so Ltl - Ltr.Unorm = I, i.e. Ltl = I + Ltr.Unorm
    # and Lbl - Lbr.Unorm = 0, i.e. Lbl = Lbr.Unorm
    I = np.identity(S.shape[0])
    L = np.linalg.inv(I - np.matmul(Unorm, Snorm))
    # use sci version - faster? Test this on big data on a machine with multiple cores
    #L = sla.inv(I-np.matmul(Unorm, Snorm))
    #np.save('L_new.npy', L)
    if verbose:
        print('DBG: Unorm and Snorm shape', Unorm.shape, Snorm.shape)
        print('DBG: L shape is ', L.shape)
    # As we are going to do e.L where the second half of e is 0, we only need the Ltl and Ltr components,
    # but then we are going to multiply by Y, where Y is 0 in the top half, so we only need Ltr
    Ltr = np.dot(Snorm, L)
    #Ltl=I+np.dot(Ltr, Unorm)
    return Ltr
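
# A minimal consistency check (illustrative sketch, not part of the pipeline):
# on small random S and U, the Ltr returned by make_L_comp_new should match the
# top-right block of the full Leontief inverse built the old way via
# make_Z_from_S_U, make_x and make_L.
def check_L_components(n=4, seed=0):
    rng = np.random.default_rng(seed)
    S = rng.random((n, n)) * 0.1
    U = rng.random((n, n)) * 0.1
    Y = rng.random((n, 2))
    # new way: components only
    sumS, sumUY = make_x_comp_new(S, U, Y, False)
    Ltr = make_L_comp_new(S, U, sumS, sumUY, False)
    # old way: full 2n x 2n system with Y stacked in the bottom half
    Z = make_Z_from_S_U(S, U, False)
    bigY = np.zeros(shape=(2*n, 2))
    bigY[n:, :] = Y
    x = make_x(Z, bigY, False)
    L = make_L(Z, x, False)
    return np.allclose(Ltr, L[:n, n:])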

def make_e(stressor, x):
    # MRI not used in this model for some reason
    e = np.zeros(shape=(1, np.size(x)))
    e[0, 0:np.size(stressor)] = np.transpose(stressor)
    e = e/x
    return e

@profile
def make_Z_from_S_U(S, U, verbose):
    # assemble the full matrix with U in the bottom-left block and S in the top-right block
    Z = np.zeros(shape=(np.size(S, 0) + np.size(U, 0), np.size(S, 1) + np.size(U, 1)))
    Z[np.size(S, 0):, 0:np.size(U, 1)] = U
    Z[0:np.size(S, 0), np.size(U, 1):] = S
    if verbose:
        print('DBG: make Z from S and U', Z.size, Z.shape)
    return Z

# there is no equivalent of make_Z_from_S_U for the new way as we just work with S and U

# I have pulled out the for loop that creates the footprint so I can time it
@profile
def calculate_footprint(bigY, eL, y_cols, su_idx, u_cols):
    footprint = np.zeros(shape=bigY.shape).T
    for a in range(np.size(bigY, 1)):
        # dot with a diagonal matrix is just an elementwise product of eL with column a of bigY
        footprint[a] = np.dot(eL, np.diag(bigY[:, a]))
    footprint = pd.DataFrame(footprint, index=y_cols, columns=su_idx)
    footprint = footprint[u_cols]
    return footprint

@profile
def indirect_footprint_SUT(S, U, Y, stressor, verbose):
    # make column names
    s_cols = S.columns.tolist()
    u_cols = U.columns.tolist()
    su_idx = pd.MultiIndex.from_arrays([[x[0] for x in s_cols] + [x[0] for x in u_cols],
                                        [x[1] for x in s_cols] + [x[1] for x in u_cols]])
    y_cols = Y.columns
    # calculate emissions
    Z = make_Z_from_S_U(S, U, verbose)
    # clear memory
    del S, U
    bigY = np.zeros(shape=[np.size(Y, 0)*2, np.size(Y, 1)])
    bigY[np.size(Y, 0):np.size(Y, 0)*2, 0:] = Y
    x = make_x(Z, bigY, verbose)
    L = make_L(Z, x, verbose)
    bigstressor = np.zeros(shape=[np.size(Y, 0)*2, 1])
    bigstressor[:np.size(Y, 0), 0] = np.array(stressor)
    e = np.sum(bigstressor, 1)/x
    #np.save('e_old.npy', e)
    eL = np.dot(e, L)
    #np.save('eL_old.npy', eL)
    if verbose:
        print('DBG: bigY shape', bigY.shape)
        print('DBG: e shape is ', e.shape, 'big_stressor is ', bigstressor.shape)
        print('DBG: eL shape is ', eL.shape)
    footprint = calculate_footprint(bigY, eL, y_cols, su_idx, u_cols)
    if verbose:
        print('DBG: footprint shape is', footprint.shape)
    return footprint

# I have pulled out the calculation of the footprint so I can time it
@profile
def calculate_footprint_new(eL2, Y, y_cols, u_cols):
    # for each column in Y the code used to take the diagonal of bigY to find the dot product with eL;
    # as bigY was 0 in the top half, only the bottom half of eL would have been valid,
    # therefore we only need to use the eL2 part.
    Y2 = Y.to_numpy()
    footprint = np.asarray([eL2*Y2[:, a] for a in range(Y.shape[1])])
    footprint = pd.DataFrame(footprint, index=y_cols, columns=u_cols)
    return footprint

# equivalent of indirect_footprint_SUT but does it as components
@profile
def indirect_footprint_SUT_new(S, U, Y, stressor, verbose):
    # calculate emissions
    sumS, sumUY = make_x_comp_new(S, U, Y, verbose)
    # stressor has a single row and may have different indexing, which messes up np.divide,
    # so just work with the underlying array
    stress = stressor.to_numpy()[0, :]
    e1 = np.divide(stress, sumS) # lower part of e is 0 as bigstressor only had stressor in the top part
    e2 = 0
    if verbose:
        print('DBG: e1 shape', e1.shape)
    #np.save('e1_new.npy', e1)
    Ltr = make_L_comp_new(S, U, sumS, sumUY, verbose)
    #eL1=np.dot(e1,Ltl)
    eL2 = np.dot(e1, Ltr)
    if verbose:
        print('DBG: Ltr shape', Ltr.shape)
    #np.save('eL2_new.npy', eL2)
    y_cols = Y.columns
    u_cols = U.columns
    footprint = calculate_footprint_new(eL2, Y, y_cols, u_cols)
    if verbose:
        print('DBG: footprint shape is', footprint.shape)
    return footprint
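
# A minimal driver sketch (illustrative only: 'config.txt' and the stressor row
# number are hypothetical; the real run script supplies its own values):
if __name__ == '__main__':
    indir, outdir, labels_fname, lookup_fname, Z_fname, Y_fname, co2_fname = read_config('config.txt')
    z_idx, industry_idx, product_idx, iix, pix, y_cols, sat_rows = get_metadata_indices(indir, labels_fname, lookup_fname)
    stressor_row = 0 # hypothetical: number of rows to skip in the co2 csv to reach the wanted stressor
    S, U, Y, stressor = read_data_new(indir + Z_fname, indir + Y_fname, indir + co2_fname,
                                      iix, pix, industry_idx, product_idx, y_cols, stressor_row)
    footprint = indirect_footprint_SUT_new(S, U, Y, stressor, True)
    footprint.to_csv(outdir + 'footprint.csv')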