-
Notifications
You must be signed in to change notification settings - Fork 1
/
maze.py
129 lines (112 loc) · 4.78 KB
/
maze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# University of Pennsylvania
# ESE650 Fall 2018
# Heejin Chloe Jeong
# Description:
# There are 112 states in total, each defined by a position and a flag status, and four cardinal actions.
# A reward equal to the number of flags collected so far is given on reaching the goal state
# (i.e. at the current state s, the agent performs an action a and observes a reward r and the next state s'.
# If s' is a goal state, r = the number of flags it has collected. Otherwise, r = 0).
# There are also six obstacles, and the agent stays at the current state if it performs an action toward
# an obstacle or off the map. The agent slips with probability 0.1 and then moves in the next clockwise
# direction instead (e.g. it performed UP, but moved to its RIGHT).
import numpy as np
import random
# Slip table: maps the chosen action ID to the action actually executed when
# the agent slips, i.e. the next clockwise direction
# (UP -> RIGHT, DOWN -> LEFT, LEFT -> UP, RIGHT -> DOWN).
ACTMAP = {
    0: 3,
    1: 2,
    2: 0,
    3: 1,
}

# Colour name -> numeric code inserted into the escape sequence by colorize().
color2num = {
    "gray": 30,
    "red": 31,
    "green": 32,
    "yellow": 33,
    "blue": 34,
    "magenta": 35,
    "cyan": 36,
    "white": 37,
    "crimson": 38,
}
class Maze():
    """Grid maze with three collectable flags, six obstacles and a slippery agent.

    The grid is 4 rows x 5 columns.  Fourteen of its cells are reachable
    (see ``idx2cell``); combined with the 8 possible flag statuses this gives
    the 112 states.  A state ID is encoded as ``8 * cell_index + flag_code``.

    Stepping onto the goal cell ends the episode and yields a reward equal to
    the number of flags collected; every other transition yields 0.
    """

    # state ID : 0, ..., 111
    # action ID : 0:UP, 1:DOWN, 2:LEFT, 3:RIGHT
    obstacles = [(0, 1), (0, 3), (2, 0), (2, 4), (3, 2), (3, 4)]

    # (row, column) displacement produced by each action ID.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
    # Cells that hold a flag -> position of that flag in the flag vector.
    _FLAG_CELLS = {(0, 2): 0, (3, 0): 1, (3, 3): 2}

    def __init__(self):
        self.episodic = True
        self.stochastic = True
        self.snum = 112  # number of states
        self.anum = 4  # number of actions
        self.slip = 0.1  # probability that step() replaces the action via ACTMAP
        self.dim = (4, 5)  # (rows, columns)
        self.start_pos = (0, 0)
        self.goal_pos = (0, 4)
        # NOTE(review): presumably the half-open range of goal state IDs
        # (cell index 12 -> IDs 96..103) -- confirm against callers.
        self.goal = (96, 104)
        self.map = np.asarray(["SWFWG", "OOOOO", "WOOOW", "FOWFW"])
        # 1 for free cells, 0 for obstacle cells.
        self.img_map = np.ones(self.dim)
        for row, col in self.obstacles:
            self.img_map[row][col] = 0
        # Cell index <-> (row, column) of every non-obstacle cell.
        self.idx2cell = {0: (0, 0), 1: (1, 0), 2: (3, 0), 3: (1, 1), 4: (2, 1), 5: (3, 1),
                         6: (0, 2), 7: (1, 2), 8: (2, 2), 9: (1, 3), 10: (2, 3), 11: (3, 3),
                         12: (0, 4), 13: (1, 4)}
        self.cell2idx = {(1, 2): 7, (0, 0): 0, (3, 3): 11, (3, 0): 2, (3, 1): 5, (2, 1): 4,
                         (0, 2): 6, (1, 3): 9, (2, 3): 10, (1, 4): 13, (2, 2): 8, (0, 4): 12,
                         (1, 0): 1, (1, 1): 3}

    def step(self, state, action, slip=True):
        """Apply ``action`` in ``state`` and return the transition.

        Input: the current state and action IDs.  When ``slip`` is true the
        action is replaced by ``ACTMAP[action]`` with probability ``self.slip``.
        Output: ``(reward, next_state, done)`` where ``done`` is the episodic
        terminal flag.  Moving into an obstacle returns ``state`` unchanged;
        moving off the map is clamped to the grid edge.

        Raises:
            ValueError: if ``action`` is not one of 0, 1, 2, 3.
        """
        # Validate before the slip lookup so that a bad action always fails
        # the same way instead of sometimes surfacing as a KeyError.
        if action not in self._MOVES:
            raise ValueError(
                "invalid action %r; expected 0 (UP), 1 (DOWN), 2 (LEFT) or 3 (RIGHT)" % (action,)
            )
        if slip and np.random.rand() < self.slip:
            action = ACTMAP[action]

        cell_idx, flag_code = divmod(state, 8)
        row, col = self.idx2cell[cell_idx]
        d_row, d_col = self._MOVES[action]
        # Clamp to the grid: a move off the map leaves that coordinate as is.
        r_next = min(max(row + d_row, 0), self.dim[0] - 1)
        c_next = min(max(col + d_col, 0), self.dim[1] - 1)
        target = (r_next, c_next)

        if (r_next == self.goal_pos[0]) and (c_next == self.goal_pos[1]):  # Reach the exit
            v_flag = self.num2flag(flag_code)
            return float(sum(v_flag)), 8 * self.cell2idx[target] + flag_code, True
        if target in self.obstacles:  # blocked: stay where we are
            return 0.0, state, False

        v_flag = self.num2flag(flag_code)
        flag_pos = self._FLAG_CELLS.get(target)
        if flag_pos is not None:  # stepping on a flag cell collects that flag
            v_flag[flag_pos] = 1
        return 0.0, 8 * self.cell2idx[target] + self.flag2num(v_flag), False

    def num2flag(self, n):
        """Decode flag code ``n`` (0..7) into a list of three 0/1 entries.

        Each element corresponds to the status of one flag:
        0 for not collected, 1 for collected.
        """
        flaglist = [(0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (1, 0, 1), (0, 1, 1), (1, 1, 1)]
        return list(flaglist[n])

    def flag2num(self, v):
        """Encode a flag list ``v`` as its flag code; inverse of num2flag().

        Returns a plain ``int`` so that state IDs built from it are not
        numpy scalars.
        """
        code = int(np.inner(v, [1, 2, 3]))
        # With two or more flags the weighted sum is offset by one, which
        # keeps the eight codes distinct ((0,0,1) and (1,1,0) both sum to 3).
        return code if sum(v) < 2 else code + 1

    def reset(self):
        """Return the initial state ID."""
        return 0

    def plot(self, state, action):
        """Print the action name and the map with the agent's cell highlighted.

        ``action`` may be ``None``, in which case ``None`` is printed for it.
        """
        row, col = self.idx2cell[state // 8]
        desc = self.map.tolist()
        line = desc[row]
        desc[row] = line[:col] + '\x1b[1;32m' + line[col] + '\x1b[0m' + line[col + 1:]
        print("action: ", ["UP", "DOWN", "LEFT", "RIGHT"][action] if action is not None else None)
        print("\n".join("".join(text) for text in desc))
def colorize(string, color, bold=False, highlight=False):
    """Return ``string`` wrapped in an ANSI escape sequence for ``color``.

    ``color`` must be a key of ``color2num`` (a ``KeyError`` is raised
    otherwise).  ``highlight`` adds 10 to the colour code and ``bold``
    appends the attribute ``1``; the sequence is closed with a reset code.
    """
    num = color2num[color]
    if highlight:
        num += 10
    # str() instead of unicode(): the latter does not exist on Python 3.
    attr = [str(num)]
    if bold:
        attr.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)