bandit_experiment_state_knownP.py
'''
Script to run tabular experiments in batch mode.
author: [email protected]
'''
import numpy as np
import pandas as pd
import argparse
import sys
import environment
import finite_tabular_agents
from feature_extractor import FeatureTrueState
from experiment import run_finite_tabular_experiment
if __name__ == '__main__':
    '''
    Run a tabular experiment according to command line arguments
    '''
    # Take in command line flags
    parser = argparse.ArgumentParser(description='Run tabular RL experiment')
    parser.add_argument('stateMul', help='state multiplier', type=int)
    parser.add_argument('gap', help='gap between best arm', type=float)
    parser.add_argument('alg', help='Agent constructor', type=str)
    parser.add_argument('scaling', help='scaling', type=float)
    parser.add_argument('seed', help='random seed', type=int)
    parser.add_argument('nEps', help='number of episodes', type=int)
    args = parser.parse_args()
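    # Example invocation (argument order follows the parser above; the values
    # are illustrative only, not taken from the original experiments):
    #   python bandit_experiment_state_knownP.py 2 0.05 PSRL 1.0 1 10000
    # i.e. stateMul=2, gap=0.05, alg='PSRL', scaling=1.0, seed=1, nEps=10000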
    # Make a filename to identify flags
    fileName = ('bandit'
                + '_stateMul=' + '%02.f' % args.stateMul
                + '_gap=' + '%04.3f' % args.gap
                + '_alg=' + str(args.alg)
                + '_scal=' + '%03.2f' % args.scaling
                + '_seed=' + str(args.seed)
                + '.csv')

    folderName = './'
    targetPath = folderName + fileName
    print('******************************************************************')
    print(fileName)
    print('******************************************************************')
    # Make the environment
    env = environment.make_stateBanditMDP(stateMul=args.stateMul, gap=args.gap)

    # Make the feature extractor
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

    # Make the agent
    alg_dict = {'PSRL': finite_tabular_agents.PSRL,
                'PSRLunif': finite_tabular_agents.PSRLunif,
                'OptimisticPSRL': finite_tabular_agents.OptimisticPSRL,
                'GaussianPSRL': finite_tabular_agents.GaussianPSRL,
                'UCBVI': finite_tabular_agents.UCBVI,
                'BEB': finite_tabular_agents.BEB,
                'BOLT': finite_tabular_agents.BOLT,
                'UCRL2': finite_tabular_agents.UCRL2,
                'UCFH': finite_tabular_agents.UCFH,
                'EpsilonGreedy': finite_tabular_agents.EpsilonGreedy}
    agent_constructor = alg_dict[args.alg]
    agent = agent_constructor(env.nState, env.nAction, env.epLen,
                              scaling=args.scaling)
    # Letting the agent know the transitions, but not the rewards:
    # concentrate the transition prior on the true dynamics via a huge pseudocount
    agent.P_prior[0, 0] = 1e9 * (np.ones(env.nState) / (env.nState - 1))
    agent.P_prior[0, 0][0] = 0

    inds = (np.arange(env.nState) % 2) > 0
    agent.P_prior[0, 1][inds] = 1e9 * (0.6 / args.stateMul)
    agent.P_prior[0, 1][~inds] = 1e9 * (0.4 / args.stateMul)
    agent.P_prior[0, 1][0] = 0

    for a in range(env.nAction):
        for s in range(1, env.nState):
            agent.P_prior[s, a][s] += 1e9
    # Run the experiment
    run_finite_tabular_experiment(agent, env, f_ext, args.nEps, args.seed,
                                  recFreq=1000, fileFreq=10000,
                                  targetPath=targetPath)