From 992e96b8454c57e9a6589f1c9458e13ab4748826 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 11 Sep 2018 05:39:11 +0900 Subject: [PATCH 01/82] feat: add type field and change some conditions --- controller.py | 9 ++++----- isolating_controller/workload.py | 7 ++++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/controller.py b/controller.py index 929bff7..ba5b308 100755 --- a/controller.py +++ b/controller.py @@ -55,10 +55,10 @@ def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicP logger = logging.getLogger('monitoring.workload_creation') logger.debug(f'{arr} is received from workload_creation queue') - if len(arr) != 4: + if len(arr) != 5: return - wl_name, pid, perf_pid, perf_interval = arr + wl_name, wl_type, pid, perf_pid, perf_interval = arr pid = int(pid) perf_pid = int(perf_pid) perf_interval = int(perf_interval) @@ -66,10 +66,9 @@ def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicP if not psutil.pid_exists(pid): return - workload = Workload(wl_name, pid, perf_pid, perf_interval) + workload = Workload(wl_name, wl_type, pid, perf_pid, perf_interval) - # FIXME: hard coded - if wl_name == 'SP': + if wl_type == 'bg': self._pending_wl.add_bg(workload) else: self._pending_wl.add_fg(workload) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index edcec88..872ebed 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -20,8 +20,9 @@ class Workload: ControlThread schedules the groups of `Workload' instances to enforce their scheduling decisions """ - def __init__(self, name: str, pid: int, perf_pid: int, perf_interval: int) -> None: + def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interval: int) -> None: self._name = name + self._wl_type = wl_type self._pid = pid self._metrics: Deque[BasicMetric] = deque() self._perf_pid = perf_pid @@ -40,6 +41,10 @@ def name(self) -> str: def pid(self) -> int: 
return self._pid + @property + def wl_type(self) -> str: + return self._wl_type + @property def metrics(self) -> Deque[BasicMetric]: return self._metrics From 3479f4845774174d99896ef1a62317954e3bab57 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 18 Sep 2018 04:09:08 +0900 Subject: [PATCH 02/82] Feat: Modify the pending_queue to treat more than two workloads --- .../isolation/policies/base_policy.py | 3 +- isolating_controller/utils/numa_topology.py | 54 +++++++++++++ isolating_controller/workload.py | 13 ++- pending_queue.py | 79 +++++++++++++------ 4 files changed, 125 insertions(+), 24 deletions(-) create mode 100644 isolating_controller/utils/numa_topology.py diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index f72fa9c..42336ee 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -17,9 +17,10 @@ class ResourceType(IntEnum): class IsolationPolicy(metaclass=ABCMeta): _IDLE_ISOLATOR: IdleIsolator = IdleIsolator() - def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: self._fg_wl = fg_wl self._bg_wl = bg_wl + self._skt_id = skt_id self._isolator_map: Mapping[Type[Isolator], Isolator] = dict() self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR diff --git a/isolating_controller/utils/numa_topology.py b/isolating_controller/utils/numa_topology.py new file mode 100644 index 0000000..901738d --- /dev/null +++ b/isolating_controller/utils/numa_topology.py @@ -0,0 +1,54 @@ +# coding: UTF-8 + +from pathlib import Path +from typing import Dict, Set, Tuple + +import aiofiles + +from .hyphen import convert_to_set + + +class NumaTopology: + BASE_PATH: Path = Path('/sys/devices/system/node') + + @staticmethod + async def get_node_topo() -> Set[int]: + online_path: Path = NumaTopology.BASE_PATH / 'online' + + async with 
aiofiles.open(online_path) as fp: + line: str = await fp.readline() + node_list = convert_to_set(line) + + return node_list + + @staticmethod + async def get_cpu_topo(node_list: Set[int]) -> Dict[int, Set[int]]: + cpu_topo: Dict[int, Set[int]] = dict() + + for num in node_list: + cpulist_path: Path = NumaTopology.BASE_PATH / f'node{num}/cpulist' + + async with aiofiles.open(cpulist_path) as fp: + line: str = await fp.readline() + cpu_topo[num] = convert_to_set(line) + + return cpu_topo + + @staticmethod + async def get_mem_topo() -> Set[int]: + has_memory_path = NumaTopology.BASE_PATH / 'has_memory' + + async with aiofiles.open(has_memory_path) as fp: + line: str = await fp.readline() + mem_topo = convert_to_set(line) + + # TODO: get_mem_topo can be enhanced by using real numa memory access latency + + return mem_topo + + @staticmethod + async def get_numa_info() -> Tuple[Dict[int, Set[int]], Set[int]]: + node_list = await NumaTopology.get_node_topo() + cpu_topo = await NumaTopology.get_cpu_topo(node_list) + mem_topo = await NumaTopology.get_mem_topo() + return cpu_topo, mem_topo diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 872ebed..7f58087 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -2,14 +2,16 @@ from collections import deque from itertools import chain -from typing import Deque, Tuple +from typing import Deque, Tuple, Dict, Set import cpuinfo import psutil +from .utils.numa_topology import NumaTopology from .metric_container.basic_metric import BasicMetric, MetricDiff from .solorun_data.datas import data_map + L3_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 @@ -79,3 +81,12 @@ def all_child_tid(self) -> Tuple[int, ...]: )) except psutil.NoSuchProcess: return tuple() + + def get_socket_id(self): + cpuset = self.cpuset + cpu_topo, _ = await NumaTopology.get_numa_info() + + # FIXME: Hardcode for assumption (one workload to one socket) + for socket_id, 
skt_cpus in cpu_topo.items(): + if cpuset in skt_cpus: + return socket_id diff --git a/pending_queue.py b/pending_queue.py index e5dbdcf..b126ee3 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -5,14 +5,16 @@ from isolating_controller.isolation.policies import IsolationPolicy from isolating_controller.workload import Workload - +from .isolating_controller.utils.numa_topology import NumaTopology class PendingQueue(Sized): - def __init__(self, policy_type: Type[IsolationPolicy]) -> None: + def __init__(self, policy_type: Type[IsolationPolicy], max_pending: int) -> None: self._policy_type: Type[IsolationPolicy] = policy_type + self._max_pending: int = max_pending - self._bg_q: Dict[Tuple[int, ...], Workload] = dict() - self._fg_q: Dict[Tuple[int, ...], Workload] = dict() + self._cur_pending: int = 0 + self._bg_q: Dict[int, Workload] = dict() + self._fg_q: Dict[int, Workload] = dict() self._pending_list: List[IsolationPolicy] = list() def __len__(self) -> int: @@ -24,33 +26,66 @@ def add_bg(self, workload: Workload) -> None: logger = logging.getLogger(__name__) logger.info(f'{workload} is ready for active as Background') - # FIXME: hard coded - other_cpuset = tuple(map(lambda x: x - 8, workload.cpuset)) - - if other_cpuset in self._fg_q: - new_group = self._policy_type(self._fg_q[other_cpuset], workload) - self._pending_list.append(new_group) - del self._fg_q[other_cpuset] - + if self._cur_pending < self._max_pending: + self._bg_q[workload.pid] = workload + self._cur_pending += 1 else: - self._bg_q[workload.cpuset] = workload + self.dump_to_pending_list() def add_fg(self, workload: Workload) -> None: logger = logging.getLogger(__name__) logger.info(f'{workload} is ready for active as Foreground') - # FIXME: hard coded - other_cpuset = tuple(map(lambda x: x + 8, workload.cpuset)) - - if other_cpuset in self._bg_q: - new_group = self._policy_type(self._bg_q[other_cpuset], workload) - self._pending_list.append(new_group) - del self._bg_q[other_cpuset] - + if 
self._cur_pending < self._max_pending: + self._fg_q[workload.pid] = workload + self._cur_pending += 1 else: - self._fg_q[workload.cpuset] = workload + self.dump_to_pending_list() def pop(self) -> IsolationPolicy: if len(self) is 0: raise IndexError(f'{self} is empty') return self._pending_list.pop() + + def dump_to_pending_list(self) -> None: + fg_pids = list(self._fg_q.keys()) + bg_pids = list(self._bg_q.keys()) + all_pids = list() + for i in range(len(self._fg_q)): + all_pids.append(fg_pids[i]) + for i in range(len(self._bg_q)): + all_pids.append(bg_pids[i]) + + node_list = await NumaTopology.get_node_topo() + group_pids = dict() # Dict. for grouping the fg and bg + for node in node_list: + group_pids[node] = set() + + for pid in all_pids: + if pid in fg_pids: + skt_id = self._fg_q[pid].get_socket_id() + group_pids[skt_id].add(pid) + elif pid in bg_pids: + skt_id = self._bg_q[pid].get_socket_id() + group_pids[skt_id].add(pid) + + # Grouping pids based on their types and skt_id + for node in node_list: + node_pidset = group_pids[node] + pid = node_pidset.pop() + if pid in fg_pids: + bg_pid = node_pidset.pop() + new_group = self._policy_type(pid, bg_pid, node) + self._pending_list.append(new_group) + del self._fg_q[pid] + del self._bg_q[bg_pid] + elif pid in bg_pids: + fg_pid = node_pidset.pop() + new_group = self._policy_type(fg_pid, pid, node) + self._pending_list.append(new_group) + del self._fg_q[fg_pid] + del self._bg_q[pid] + return + + def update_max_pending(self, new_max_pending: int): + self._max_pending = new_max_pending From 702507fba711a59f3d7bc1be9d6621a4f3b548ad Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 18 Sep 2018 14:39:15 +0900 Subject: [PATCH 03/82] fix: fix NumaTopology from async to sync --- isolating_controller/utils/numa_topology.py | 26 ++++++++++----------- pending_queue.py | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/isolating_controller/utils/numa_topology.py 
b/isolating_controller/utils/numa_topology.py index 901738d..18c998f 100644 --- a/isolating_controller/utils/numa_topology.py +++ b/isolating_controller/utils/numa_topology.py @@ -12,34 +12,34 @@ class NumaTopology: BASE_PATH: Path = Path('/sys/devices/system/node') @staticmethod - async def get_node_topo() -> Set[int]: + def get_node_topo() -> Set[int]: online_path: Path = NumaTopology.BASE_PATH / 'online' - async with aiofiles.open(online_path) as fp: - line: str = await fp.readline() + with open(online_path) as fp: + line: str = fp.readline() node_list = convert_to_set(line) return node_list @staticmethod - async def get_cpu_topo(node_list: Set[int]) -> Dict[int, Set[int]]: + def get_cpu_topo(node_list: Set[int]) -> Dict[int, Set[int]]: cpu_topo: Dict[int, Set[int]] = dict() for num in node_list: cpulist_path: Path = NumaTopology.BASE_PATH / f'node{num}/cpulist' - async with aiofiles.open(cpulist_path) as fp: - line: str = await fp.readline() + with open(cpulist_path) as fp: + line: str = fp.readline() cpu_topo[num] = convert_to_set(line) return cpu_topo @staticmethod - async def get_mem_topo() -> Set[int]: + def get_mem_topo() -> Set[int]: has_memory_path = NumaTopology.BASE_PATH / 'has_memory' - async with aiofiles.open(has_memory_path) as fp: - line: str = await fp.readline() + with open(has_memory_path) as fp: + line: str = fp.readline() mem_topo = convert_to_set(line) # TODO: get_mem_topo can be enhanced by using real numa memory access latency @@ -47,8 +47,8 @@ async def get_mem_topo() -> Set[int]: return mem_topo @staticmethod - async def get_numa_info() -> Tuple[Dict[int, Set[int]], Set[int]]: - node_list = await NumaTopology.get_node_topo() - cpu_topo = await NumaTopology.get_cpu_topo(node_list) - mem_topo = await NumaTopology.get_mem_topo() + def get_numa_info() -> Tuple[Dict[int, Set[int]], Set[int]]: + node_list = NumaTopology.get_node_topo() + cpu_topo = NumaTopology.get_cpu_topo(node_list) + mem_topo = NumaTopology.get_mem_topo() return cpu_topo, 
mem_topo diff --git a/pending_queue.py b/pending_queue.py index b126ee3..404381b 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -56,8 +56,8 @@ def dump_to_pending_list(self) -> None: for i in range(len(self._bg_q)): all_pids.append(bg_pids[i]) - node_list = await NumaTopology.get_node_topo() - group_pids = dict() # Dict. for grouping the fg and bg + node_list = NumaTopology.get_node_topo() + group_pids = dict() # Dict. for grouping the fg and bg for node in node_list: group_pids[node] = set() From ee6a735c333bb3c2f5dd580a13858b675b271645 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Wed, 19 Sep 2018 15:25:34 +0900 Subject: [PATCH 04/82] fix: Fix grouping code in pending_queue.py --- controller.py | 8 +++-- .../isolation/isolators/base_isolator.py | 8 ++--- .../isolation/policies/diff_policy.py | 4 +-- .../policies/diff_with_violation_policy.py | 4 +-- .../isolation/policies/greedy_diff_policy.py | 4 +-- .../greedy_diff_with_violation_policy.py | 4 +-- isolating_controller/utils/hyphen.py | 22 ++++++++++++ isolating_controller/utils/numa_topology.py | 8 ++--- isolating_controller/workload.py | 16 +++++---- pending_queue.py | 35 ++++++++++++------- 10 files changed, 76 insertions(+), 37 deletions(-) create mode 100644 isolating_controller/utils/hyphen.py diff --git a/controller.py b/controller.py index 6ca9969..ca39610 100755 --- a/controller.py +++ b/controller.py @@ -24,6 +24,7 @@ from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue +from threading import RLock MIN_PYTHON = (3, 6) @@ -44,8 +45,10 @@ def __init__(self, metric_buf_size: int) -> None: self._rmq_host = 'localhost' self._rmq_creation_queue = 'workload_creation' - self._pending_wl = PendingQueue(DiffPolicy) + ## FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) + self._pending_wl = PendingQueue(DiffPolicy, 4) self._control_thread = 
ControlThread(self._pending_wl) + self._lock = RLock() def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: ch.basic_ack(method.delivery_tag) @@ -67,10 +70,11 @@ def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicP return workload = Workload(wl_name, wl_type, pid, perf_pid, perf_interval) - if wl_type == 'bg': + logger.info(f'{workload} is background process') self._pending_wl.add_bg(workload) else: + logger.info(f'{workload} is foreground process') self._pending_wl.add_fg(workload) logger.info(f'{workload} is created') diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index 51a3129..264696c 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -14,7 +14,7 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._foreground_wl = foreground_wl self._background_wl = background_wl - self._is_fist_decision: bool = True + self._is_first_decision: bool = True @abstractmethod def strengthen(self) -> 'Isolator': @@ -63,7 +63,7 @@ def yield_isolation(self) -> None: Declare to stop the configuration search for the current isolator. Must be called when the current isolator yields the initiative. 
""" - self._is_fist_decision = True + self._is_first_decision = True @abstractmethod def _first_decision(self) -> NextStep: @@ -74,8 +74,8 @@ def _monitoring_result(self) -> NextStep: pass def decide_next_step(self) -> NextStep: - if self._is_fist_decision: - self._is_fist_decision = False + if self._is_first_decision: + self._is_first_decision = False return self._first_decision() else: diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index 56975d9..5757a48 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -8,8 +8,8 @@ class DiffPolicy(IsolationPolicy): - def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: - super().__init__(fg_wl, bg_wl) + def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: + super().__init__(fg_wl, bg_wl, skt_id) self._is_llc_isolated = False self._is_mem_isolated = False diff --git a/isolating_controller/isolation/policies/diff_with_violation_policy.py b/isolating_controller/isolation/policies/diff_with_violation_policy.py index 6b457a4..db58386 100644 --- a/isolating_controller/isolation/policies/diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/diff_with_violation_policy.py @@ -11,8 +11,8 @@ class DiffWViolationPolicy(DiffPolicy): VIOLATION_THRESHOLD = 3 - def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: - super().__init__(fg_wl, bg_wl) + def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: + super().__init__(fg_wl, bg_wl, skt_id) self._violation_count: int = 0 diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 4cd1fad..d2b0fc9 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -8,8 +8,8 @@ class 
GreedyDiffPolicy(IsolationPolicy): - def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: - super().__init__(fg_wl, bg_wl) + def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: + super().__init__(fg_wl, bg_wl, skt_id) self._is_mem_isolated = False diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index a10ef8b..84d41ff 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -11,8 +11,8 @@ class GreedyDiffWViolationPolicy(GreedyDiffPolicy): VIOLATION_THRESHOLD = 3 - def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: - super().__init__(fg_wl, bg_wl) + def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: + super().__init__(fg_wl, bg_wl, skt_id) self._violation_count: int = 0 diff --git a/isolating_controller/utils/hyphen.py b/isolating_controller/utils/hyphen.py new file mode 100644 index 0000000..0ac117e --- /dev/null +++ b/isolating_controller/utils/hyphen.py @@ -0,0 +1,22 @@ +# coding: UTF-8 + +from typing import Iterable, Set + + +def convert_to_set(hyphen_str: str) -> Set[int]: + ret = set() + + for elem in hyphen_str.split(','): + group = tuple(map(int, elem.split('-'))) + + if len(group) is 1: + ret.add(group[0]) + elif len(group) is 2: + ret.update(range(group[0], group[1] + 1)) + + return ret + + +def convert_to_hyphen(core_ids: Iterable[int]) -> str: + # TODO + return ','.join(map(str, set(core_ids))) diff --git a/isolating_controller/utils/numa_topology.py b/isolating_controller/utils/numa_topology.py index 18c998f..cf78890 100644 --- a/isolating_controller/utils/numa_topology.py +++ b/isolating_controller/utils/numa_topology.py @@ -3,8 +3,6 @@ from pathlib import Path from typing import Dict, Set, Tuple -import aiofiles - from .hyphen import convert_to_set 
@@ -15,7 +13,7 @@ class NumaTopology: def get_node_topo() -> Set[int]: online_path: Path = NumaTopology.BASE_PATH / 'online' - with open(online_path) as fp: + with open(online_path, "r") as fp: line: str = fp.readline() node_list = convert_to_set(line) @@ -28,7 +26,7 @@ def get_cpu_topo(node_list: Set[int]) -> Dict[int, Set[int]]: for num in node_list: cpulist_path: Path = NumaTopology.BASE_PATH / f'node{num}/cpulist' - with open(cpulist_path) as fp: + with open(cpulist_path, "r") as fp: line: str = fp.readline() cpu_topo[num] = convert_to_set(line) @@ -38,7 +36,7 @@ def get_cpu_topo(node_list: Set[int]) -> Dict[int, Set[int]]: def get_mem_topo() -> Set[int]: has_memory_path = NumaTopology.BASE_PATH / 'has_memory' - with open(has_memory_path) as fp: + with open(has_memory_path, "r") as fp: line: str = fp.readline() mem_topo = convert_to_set(line) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 7f58087..f333a28 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -2,7 +2,7 @@ from collections import deque from itertools import chain -from typing import Deque, Tuple, Dict, Set +from typing import Deque, Tuple, Set import cpuinfo import psutil @@ -82,11 +82,15 @@ def all_child_tid(self) -> Tuple[int, ...]: except psutil.NoSuchProcess: return tuple() - def get_socket_id(self): - cpuset = self.cpuset - cpu_topo, _ = await NumaTopology.get_numa_info() + def get_socket_id(self) -> int: + cpuset: Set[int] = self.cpuset + cpu_topo, _ = NumaTopology.get_numa_info() + ret = None # FIXME: Hardcode for assumption (one workload to one socket) for socket_id, skt_cpus in cpu_topo.items(): - if cpuset in skt_cpus: - return socket_id + print(f'cpuset: {cpuset}, socket_id: {socket_id}, skt_cpus: {skt_cpus}') + for cpu_id in cpuset: + if cpu_id in skt_cpus: + ret = socket_id + return ret diff --git a/pending_queue.py b/pending_queue.py index 404381b..877d975 100644 --- a/pending_queue.py +++ b/pending_queue.py 
@@ -1,11 +1,13 @@ # coding: UTF-8 import logging -from typing import Dict, List, Sized, Tuple, Type +from threading import RLock + +from typing import Dict, List, Sized, Type from isolating_controller.isolation.policies import IsolationPolicy from isolating_controller.workload import Workload -from .isolating_controller.utils.numa_topology import NumaTopology +from isolating_controller.utils.numa_topology import NumaTopology class PendingQueue(Sized): def __init__(self, policy_type: Type[IsolationPolicy], max_pending: int) -> None: @@ -25,21 +27,22 @@ def __len__(self) -> int: def add_bg(self, workload: Workload) -> None: logger = logging.getLogger(__name__) logger.info(f'{workload} is ready for active as Background') + logger.info(f'self._cur_pending: {self._cur_pending}') - if self._cur_pending < self._max_pending: - self._bg_q[workload.pid] = workload - self._cur_pending += 1 - else: + self._bg_q[workload.pid] = workload + self._cur_pending += 1 + if self._cur_pending == self._max_pending: self.dump_to_pending_list() + def add_fg(self, workload: Workload) -> None: logger = logging.getLogger(__name__) logger.info(f'{workload} is ready for active as Foreground') + logger.info(f'self._cur_pending: {self._cur_pending}') - if self._cur_pending < self._max_pending: - self._fg_q[workload.pid] = workload - self._cur_pending += 1 - else: + self._fg_q[workload.pid] = workload + self._cur_pending += 1 + if self._cur_pending == self._max_pending: self.dump_to_pending_list() def pop(self) -> IsolationPolicy: @@ -48,6 +51,9 @@ def pop(self) -> IsolationPolicy: return self._pending_list.pop() def dump_to_pending_list(self) -> None: + logger = logging.getLogger(__name__) + logger.info('Dumping workloads to pending list!') + fg_pids = list(self._fg_q.keys()) bg_pids = list(self._bg_q.keys()) all_pids = list() @@ -69,19 +75,24 @@ def dump_to_pending_list(self) -> None: skt_id = self._bg_q[pid].get_socket_id() group_pids[skt_id].add(pid) + logger.info('Trying to create new 
groups!') + # # Grouping pids based on their types and skt_id for node in node_list: node_pidset = group_pids[node] pid = node_pidset.pop() + print(f'Pop {pid}!') if pid in fg_pids: bg_pid = node_pidset.pop() - new_group = self._policy_type(pid, bg_pid, node) + print(f'Pop {bg_pid}!') + new_group = self._policy_type(self._fg_q[pid], self._bg_q[bg_pid], node) self._pending_list.append(new_group) del self._fg_q[pid] del self._bg_q[bg_pid] elif pid in bg_pids: fg_pid = node_pidset.pop() - new_group = self._policy_type(fg_pid, pid, node) + print(f'Pop {fg_pid}!') + new_group = self._policy_type(self._fg_q[fg_pid], self._bg_q[pid], node) self._pending_list.append(new_group) del self._fg_q[fg_pid] del self._bg_q[pid] From 75f8bfb26e8199b896506be9e3ec5ff5d87be23d Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Wed, 19 Sep 2018 20:56:12 +0900 Subject: [PATCH 05/82] feat: Add SwapIsolator skeleton code --- .../isolation/isolators/swap.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 isolating_controller/isolation/isolators/swap.py diff --git a/isolating_controller/isolation/isolators/swap.py b/isolating_controller/isolation/isolators/swap.py new file mode 100644 index 0000000..3be9aca --- /dev/null +++ b/isolating_controller/isolation/isolators/swap.py @@ -0,0 +1,88 @@ +# coding: UTF-8 + +import logging + +from typing import Dict, Set + +from .base_isolator import Isolator +from .. 
import NextStep +from ...workload import Workload +from ..policies import IsolationPolicy + +class SwapIsolator(Isolator): + _THRESHOLD = 0.005 + + def __init__(self, foreground_wl: Workload, background_wl: Workload, + isolation_groups: Dict[IsolationPolicy, int]) -> None: + super().__init__(foreground_wl, background_wl) + + self._all_groups = isolation_groups + self._swap_candidates: Set[Workload] = None + + def __del__(self): + logger = logging.getLogger(__name__) + if self._foreground_wl.is_running: + logger.debug(f'reset swap configuration of {self._foreground_wl}') + + if self._background_wl.is_running: + logger.debug(f'reset swap configuration of {self._background_wl}') + + + def strengthen(self) -> 'Isolator': + """ + Choosing which contentious workloads to swap out to other socket + :return: + """ + # FIXME: hard coded (two sockets) + ## Estimating the socket contention + ## + + return + + @property + def is_max_level(self) -> bool: + """ + Searching configuration space to the max level + e.g., There is no searchable candidate to strengthen the degree of isolation + :return: + """ + # FIXME: + + return False + + @property + def is_min_level(self) -> bool: + """ + Searching configuration space to the min level + e.g., There is no searchable candidate to weaken the degree of isolation + :return: + """ + # FIXME: + + return False + + def weaken(self) -> 'Isolator': + """ + Choosing which contentious workloads to swap in from other socket + :return: + """ + # FIXME: hard coded (two sockets) + pass + + def _enforce(self) -> None: + """ + Enforcing the pre-configured swap isolation + :return: + """ + pass + +# def enforce(self) -> None: +# self._prev_metric_diff: MetricDiff = self._foreground_wl.calc_metric_diff() +# +# self._enforce() + + def _first_decision(self) -> NextStep: + pass + + def _monitoring_result(self) -> NextStep: + pass From a09f1d4f58e121a9a9ffd1b843fe4473a8f3cbfa Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Thu, 20 Sep 2018 21:35:30 +0900 
Subject: [PATCH 06/82] feat: Add cgroup cycle throttle and create cgroup.py based on cgroup_cpuset.py --- isolating_controller/utils/cgroup.py | 61 ++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 isolating_controller/utils/cgroup.py diff --git a/isolating_controller/utils/cgroup.py b/isolating_controller/utils/cgroup.py new file mode 100644 index 0000000..2f690fd --- /dev/null +++ b/isolating_controller/utils/cgroup.py @@ -0,0 +1,61 @@ +# coding: UTF-8 + + +import subprocess +import getpass +import grp +import os + +from typing import Iterable, Set, Optional +from .hyphen import convert_to_set + + +class Cgroup: + CPUSET_MOUNT_POINT = '/sys/fs/cgroup/cpuset' + CPU_MOUNT_POINT = '/sys/fs/cgroup/cpu' + + def __init__(self, group_name: str, controllers: str) -> None: + self._group_name: str = group_name + self._controllers: str = controllers + self._group_path: str = f'{controllers}:{group_name}' + + def create_group(self) -> None: + uname: str = getpass.getuser() + gid: int = os.getegid() + gname: str = grp.getgrgid(gid).gr_name + + subprocess.check_call(args=( + 'sudo', 'cgcreate', '-a', f'{uname}:{gname}', '-d', '700', '-f', + '600', '-t', f'{uname}:{gname}', '-s', '600', '-g', self._group_path)) + + def assign_cpus(self, core_set: Set[int]) -> None: + core_ids = ','.join(map(str, core_set)) + subprocess.check_call(args=('cgset', '-r', f'cpuset.cpus={core_ids}', self._group_name)) + + def assign_mems(self, socket_set: Set[int]) -> None: + mem_ids = ','.join(map(str, socket_set)) + subprocess.check_call(args=('cgset', '-r', f'cpuset.mems={mem_ids}', self._group_name)) + + def _get_cpu_affinity_from_group(self) -> Set[int]: + with open(f'{Cgroup.CPUSET_MOUNT_POINT}/{self._group_name}/cpuset.cpus', "r") as fp: + line: str = fp.readline() + core_set: Set[int] = convert_to_set(line) + return core_set + + def limit_cpu_quota(self, limit_percentage: float, period: Optional[int]=None) -> None: + if period is None: + with 
open(f'{Cgroup.CPU_MOUNT_POINT}/cpu.cfs_period_us', "r") as fp: + line: str = fp.readline() + period = int(line) + + cpu_cores = self._get_cpu_affinity_from_group() + quota = int(period * limit_percentage/100 * len(cpu_cores)) + subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_quota_us={quota}', self._group_name)) + + subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_period_us={period}', self._group_name)) + + def add_tasks(self, pids: Iterable[int]) -> None: + subprocess.check_call(args=('cgclassify', '-g', self._group_path, '--sticky', *map(str, pids))) + + def delete(self) -> None: + subprocess.check_call(args=('sudo', 'cgdelete', '-r', '-g', self._group_path)) From 063e25b02cc9c193f38b4c60e7fecdc2cdb8d1fb Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Thu, 20 Sep 2018 21:36:50 +0900 Subject: [PATCH 07/82] feat: Fix SchedIsolator (strengthen, weaken, properties) --- .../isolation/isolators/schedule.py | 89 ++++++++++++++----- isolating_controller/utils/__init__.py | 2 + isolating_controller/workload.py | 8 +- 3 files changed, 77 insertions(+), 22 deletions(-) diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 78ae378..8fbe218 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -2,11 +2,15 @@ import logging +from typing import Tuple, Set, Dict + from .base_isolator import Isolator from .. 
import NextStep -from ...utils import CgroupCpuset +#from ...utils import CgroupCpuset from ...workload import Workload - +from ...utils import Cgroup +from ...utils import NumaTopology +from ...utils import hyphen class SchedIsolator(Isolator): _DOD_THRESHOLD = 0.005 @@ -15,46 +19,87 @@ class SchedIsolator(Isolator): def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) - # FIXME: hard coded - self._cur_step = 24 + self._fg_cpuset: Tuple[int] = foreground_wl.cpuset + self._bg_cpuset: Tuple[int] = background_wl.cpuset + self._cur_bg_step: int = min(self._bg_cpuset) + self._cur_fg_step: int = max(self._fg_cpuset) - self._bg_grp_name = f'{background_wl.name}_{background_wl.pid}' - self._prev_bg_affinity = background_wl.cpuset + self._fg_next_step = NextStep.IDLE + self._bg_next_step = NextStep.IDLE - CgroupCpuset.create_group(self._bg_grp_name) - CgroupCpuset.add_task(self._bg_grp_name, background_wl.pid) - # FIXME: hard coded - CgroupCpuset.assign(self._bg_grp_name, set(range(self._cur_step, 32))) + self._bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' + self._prev_bg_affinity: Tuple[int] = background_wl.cpuset + self._cgroup = Cgroup(self._bg_grp_name, 'cpuset,cpu') + + cpu_topo, mem_topo = NumaTopology.get_numa_info() + self._cpu_topo: Dict[int, Set[int]] = cpu_topo + self._mem_topo: Set[int] = mem_topo def __del__(self) -> None: if self._background_wl.is_running: - CgroupCpuset.assign(self._bg_grp_name, set(self._prev_bg_affinity)) + self._cgroup.assign_cpus(set(self._prev_bg_affinity)) def strengthen(self) -> 'SchedIsolator': - self._cur_step += 1 + """ + Strengthen reduces the number of CPUs assigned to BG workloads and increase that of FG workload + TODO: Changing Step Size if needed + :return: + """ + # NOTE: Caller is assumed that BG workload + if self._bg_next_step == NextStep.STRENGTHEN: + self._cur_bg_step += 1 + bg_cpuset = set(self._bg_cpuset) + 
bg_cpuset.remove(self._cur_bg_step) + self._bg_cpuset = tuple(bg_cpuset) + if self._fg_next_step == NextStep.WEAKEN: + self._cur_fg_step += 1 + fg_cpuset = set(self._fg_cpuset) + fg_cpuset.add(self._cur_fg_step) + self._fg_cpuset = tuple(fg_cpuset) return self def weaken(self) -> 'SchedIsolator': - self._cur_step -= 1 + """ + Weaken increase the number of CPUs assigned to BG workloads and decrease that of FG workload + TODO: Changing Step Size if needed + :return: + """ + # NOTE: Caller is assumed that BG workload + if self._bg_next_step == NextStep.WEAKEN: + self._cur_bg_step -= 1 + bg_cpuset = set(self._bg_cpuset) + bg_cpuset.add(self._cur_bg_step) + self._bg_cpuset = tuple(bg_cpuset) + if self._fg_next_step == NextStep.STRENGTHEN: + self._cur_fg_step -= 1 + fg_cpuset = set(self._fg_cpuset) + fg_cpuset.remove(self._cur_fg_step) + self._fg_cpuset = tuple(fg_cpuset) return self @property def is_max_level(self) -> bool: - # FIXME: hard coded - return self._cur_step == 31 + # FIXME: How about first condition is true but the other is false? + if self._bg_next_step == NextStep.STRENGTHEN: + return self._cur_bg_step == max(self._cpu_topo[self._background_wl.socket_id]) + if self._fg_next_step == NextStep.WEAKEN: + return self._cur_fg_step == self._cur_bg_step-1 @property def is_min_level(self) -> bool: - # FIXME: hard coded - return self._cur_step == 24 + # FIXME: How about first condition is true but the other is false? 
+ if self._bg_next_step == NextStep.WEAKEN: + return self._cur_bg_step == self._cur_fg_step+1 + if self._fg_next_step == NextStep.STRENGTHEN: + return self._cur_fg_step == min(self._cpu_topo[self._foreground_wl.socket_id]) def _enforce(self) -> None: logger = logging.getLogger(__name__) - # FIXME: hard coded - logger.info(f'affinity of background is {self._cur_step}-31') + logger.info(f'affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') - # FIXME: hard coded - CgroupCpuset.assign(self._bg_grp_name, set(range(self._cur_step, 32))) + # FIXME: Only changing the number of CPUs of BG process + self._cgroup.assign_cpus(set(self._bg_cpuset)) + self._cgroup.assign_cpus(set(self._fg_cpuset)) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() @@ -94,7 +139,9 @@ def _monitoring_result(self) -> NextStep: return NextStep.STOP elif curr_diff > 0: + self._bg_next_step = NextStep.WEAKEN return NextStep.WEAKEN else: + self._bg_next_step = NextStep.STRENGTHEN return NextStep.STRENGTHEN diff --git a/isolating_controller/utils/__init__.py b/isolating_controller/utils/__init__.py index 37479ca..ce410ef 100644 --- a/isolating_controller/utils/__init__.py +++ b/isolating_controller/utils/__init__.py @@ -2,4 +2,6 @@ from .cat import CAT from .cgroup_cpuset import CgroupCpuset +from .cgroup import Cgroup from .dvfs import DVFS +from .numa_topology import NumaTopology diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index f333a28..46f614c 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -31,6 +31,7 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._perf_interval = perf_interval self._proc_info = psutil.Process(pid) + self._socket_id = None def __repr__(self) -> str: return f'{self._name} (pid: {self._pid})' @@ -51,6 +52,11 @@ def wl_type(self) -> str: def metrics(self) -> Deque[BasicMetric]: return self._metrics + 
@property + def socket_id(self) -> int: + self._socket_id = self.get_socket_id() + return self._socket_id + @property def cpuset(self) -> Tuple[int, ...]: return tuple(self._proc_info.cpu_affinity()) @@ -85,7 +91,6 @@ def all_child_tid(self) -> Tuple[int, ...]: def get_socket_id(self) -> int: cpuset: Set[int] = self.cpuset cpu_topo, _ = NumaTopology.get_numa_info() - ret = None # FIXME: Hardcode for assumption (one workload to one socket) for socket_id, skt_cpus in cpu_topo.items(): @@ -93,4 +98,5 @@ def get_socket_id(self) -> int: for cpu_id in cpuset: if cpu_id in skt_cpus: ret = socket_id + self._socket_id = ret return ret From 9bef9471d4025d32c6fc1386d12b5f1a8e9f0277 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Fri, 21 Sep 2018 11:00:18 +0900 Subject: [PATCH 08/82] fix: Rename to CoreIsolator and add fg_next_step code --- .../isolation/isolators/__init__.py | 2 +- .../isolators/{schedule.py => core.py} | 45 ++++++++++++++----- .../isolation/policies/base_policy.py | 4 +- .../isolation/policies/diff_policy.py | 4 +- .../policies/diff_with_violation_policy.py | 4 +- .../isolation/policies/greedy_diff_policy.py | 4 +- .../greedy_diff_with_violation_policy.py | 4 +- 7 files changed, 44 insertions(+), 23 deletions(-) rename isolating_controller/isolation/isolators/{schedule.py => core.py} (74%) diff --git a/isolating_controller/isolation/isolators/__init__.py b/isolating_controller/isolation/isolators/__init__.py index b0084b8..634a419 100644 --- a/isolating_controller/isolation/isolators/__init__.py +++ b/isolating_controller/isolation/isolators/__init__.py @@ -5,4 +5,4 @@ from .cache import CacheIsolator from .idle import IdleIsolator from .memory import MemoryIsolator -from .schedule import SchedIsolator +from .core import CoreIsolator diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/core.py similarity index 74% rename from isolating_controller/isolation/isolators/schedule.py rename to 
isolating_controller/isolation/isolators/core.py index 8fbe218..034a04a 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/core.py @@ -6,13 +6,12 @@ from .base_isolator import Isolator from .. import NextStep -#from ...utils import CgroupCpuset from ...workload import Workload from ...utils import Cgroup from ...utils import NumaTopology from ...utils import hyphen -class SchedIsolator(Isolator): +class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 @@ -39,10 +38,10 @@ def __del__(self) -> None: if self._background_wl.is_running: self._cgroup.assign_cpus(set(self._prev_bg_affinity)) - def strengthen(self) -> 'SchedIsolator': + def strengthen(self) -> 'CoreIsolator': """ Strengthen reduces the number of CPUs assigned to BG workloads and increase that of FG workload - TODO: Changing Step Size if needed + TODO: Changing step size, if needed :return: """ # NOTE: Caller is assumed that BG workload @@ -58,10 +57,10 @@ def strengthen(self) -> 'SchedIsolator': self._fg_cpuset = tuple(fg_cpuset) return self - def weaken(self) -> 'SchedIsolator': + def weaken(self) -> 'CoreIsolator': """ Weaken increase the number of CPUs assigned to BG workloads and decrease that of FG workload - TODO: Changing Step Size if needed + TODO: Changing step size, if needed :return: """ # NOTE: Caller is assumed that BG workload @@ -96,8 +95,8 @@ def is_min_level(self) -> bool: def _enforce(self) -> None: logger = logging.getLogger(__name__) logger.info(f'affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') + logger.info(f'affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') - # FIXME: Only changing the number of CPUs of BG process self._cgroup.assign_cpus(set(self._bg_cpuset)) self._cgroup.assign_cpus(set(self._fg_cpuset)) @@ -108,17 +107,29 @@ def _first_decision(self) -> NextStep: logger = logging.getLogger(__name__) logger.debug(f'current diff: {curr_diff:>7.4f}') + ## 
FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) + fg_strengthen_cond = None + fg_weaken_cond = None if curr_diff < 0: if self.is_max_level: + self._bg_next_step = NextStep.STOP return NextStep.STOP else: + self._bg_next_step = NextStep.STRENGTHEN + if fg_weaken_cond: + self._fg_next_step = NextStep.WEAKEN return NextStep.STRENGTHEN - elif curr_diff <= SchedIsolator._FORCE_THRESHOLD: + elif curr_diff <= CoreIsolator._FORCE_THRESHOLD: + self._bg_next_step = NextStep.STOP return NextStep.STOP else: if self.is_min_level: + self._bg_next_step = NextStep.STOP return NextStep.STOP else: + self._bg_next_step = NextStep.WEAKEN + if fg_strengthen_cond: + self._fg_next_step = NextStep.STRENGTHEN return NextStep.WEAKEN def _monitoring_result(self) -> NextStep: @@ -132,16 +143,26 @@ def _monitoring_result(self) -> NextStep: logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - # FIXME: hard coded - if not (24 < self._cur_step < 31) \ - or abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= SchedIsolator._DOD_THRESHOLD: + # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) + fg_strengthen_cond = None + fg_weaken_cond = None + max_bg_cpuid = max(self._cpu_topo[self._background_wl.socket_id]) + min_bg_cpuid = min(self._cpu_topo[self._background_wl.socket_id]) + if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid) \ + or abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ + or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: + self._bg_next_step = NextStep.STOP + self._fg_next_step = NextStep.STOP return NextStep.STOP elif curr_diff > 0: self._bg_next_step = NextStep.WEAKEN + if fg_strengthen_cond: + self._fg_next_step = NextStep.STRENGTHEN return NextStep.WEAKEN else: self._bg_next_step = NextStep.STRENGTHEN + if fg_weaken_cond: + self._fg_next_step = NextStep.WEAKEN return NextStep.STRENGTHEN diff --git 
a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 42336ee..75793dd 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -5,7 +5,7 @@ from typing import Mapping, Type from isolating_controller.metric_container.basic_metric import MetricDiff -from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, CoreIsolator from ...workload import Workload @@ -35,7 +35,7 @@ def init_isolators(self) -> None: self._isolator_map = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)) + (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)) )) @property diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index 5757a48..f36c22e 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -3,7 +3,7 @@ import logging from .base_policy import IsolationPolicy, ResourceType -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload @@ -47,7 +47,7 @@ def choose_next_isolator(self) -> bool: return True elif not self._is_sched_isolated and resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[SchedIsolator] + self._cur_isolator = self._isolator_map[CoreIsolator] self._is_sched_isolated = True logger.info(f'Cpuset Isolation for {self._fg_wl} is started') return True diff --git a/isolating_controller/isolation/policies/diff_with_violation_policy.py 
b/isolating_controller/isolation/policies/diff_with_violation_policy.py index db58386..8df9003 100644 --- a/isolating_controller/isolation/policies/diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .base_policy import ResourceType from .diff_policy import DiffPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload @@ -22,7 +22,7 @@ def _check_violation(self) -> bool: return \ resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, SchedIsolator)) + and not isinstance(self._cur_isolator, CoreIsolator)) @property def new_isolator_needed(self) -> bool: diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index d2b0fc9..a0a4d43 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -3,7 +3,7 @@ import logging from .base_policy import IsolationPolicy, ResourceType -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload @@ -35,7 +35,7 @@ def choose_next_isolator(self) -> bool: return True elif resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[SchedIsolator] + self._cur_isolator = self._isolator_map[CoreIsolator] self._is_mem_isolated = False logger.info(f'Cpuset Isolation for {self._fg_wl} is started') return True diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py 
b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 84d41ff..980d178 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .base_policy import ResourceType from .greedy_diff_policy import GreedyDiffPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload @@ -22,7 +22,7 @@ def _check_violation(self) -> bool: return \ resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, SchedIsolator)) + and not isinstance(self._cur_isolator, CoreIsolator)) @property def new_isolator_needed(self) -> bool: From 46029a90bb181ed9993d58ca154def4ab5b87a57 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Fri, 21 Sep 2018 17:41:22 +0900 Subject: [PATCH 09/82] fix: Fix hard coded parts and add resctrl.py to utils --- .../isolation/isolators/cache.py | 72 +++++++++++-------- .../isolation/isolators/core.py | 15 ++-- .../isolation/policies/base_policy.py | 1 - isolating_controller/utils/__init__.py | 1 + isolating_controller/utils/resctrl.py | 67 +++++++++++++++++ 5 files changed, 123 insertions(+), 33 deletions(-) create mode 100644 isolating_controller/utils/resctrl.py diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 5def5d2..e7f2002 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -1,11 +1,12 @@ # coding: UTF-8 import logging -from typing import Optional +from typing import Optional, Dict, Set from .base_isolator import Isolator from .. 
import NextStep -from ...utils import CAT +from ...utils import ResCtrl +from ...utils import NumaTopology from ...workload import Workload @@ -20,33 +21,32 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._cur_step: Optional[int] = None self._fg_grp_name = f'{foreground_wl.name}_{foreground_wl.pid}' - CAT.create_group(self._fg_grp_name) - for tid in foreground_wl.all_child_tid(): - CAT.add_task(self._fg_grp_name, tid) - self._bg_grp_name = f'{background_wl.name}_{background_wl.pid}' - CAT.create_group(self._bg_grp_name) - for tid in background_wl.all_child_tid(): - CAT.add_task(self._bg_grp_name, tid) + + self._fg_resctrl = ResCtrl(self._fg_grp_name) + self._bg_resctrl = ResCtrl(self._bg_grp_name) def __del__(self) -> None: logger = logging.getLogger(__name__) + max_bits = ResCtrl.MAX_BITS + max_mask = ResCtrl.gen_mask(0, max_bits) + if self._foreground_wl.is_running: logger.debug(f'reset resctrl configuration of {self._foreground_wl}') - # FIXME: hard coded - CAT.assign(self._fg_grp_name, '1', CAT.gen_mask(0, CAT.MAX)) + # FIXME: The number of socket is two at most + ResCtrl.assign_llc(self._fg_resctrl, max_mask, max_mask) if self._background_wl.is_running: logger.debug(f'reset resctrl configuration of {self._background_wl}') - # FIXME: hard coded - CAT.assign(self._bg_grp_name, '1', CAT.gen_mask(0, CAT.MAX)) + # FIXME: The number of socket is two at most + ResCtrl.assign_llc(self._bg_resctrl, max_mask, max_mask) def strengthen(self) -> 'CacheIsolator': self._prev_step = self._cur_step if self._cur_step is None: - self._cur_step = CAT.MAX // 2 + self._cur_step = ResCtrl.MAX_BITS // 2 else: self._cur_step += 1 @@ -66,34 +66,50 @@ def weaken(self) -> 'CacheIsolator': @property def is_max_level(self) -> bool: # FIXME: hard coded - return self._cur_step is not None and self._cur_step + CAT.STEP >= CAT.MAX + return self._cur_step is not None and self._cur_step + ResCtrl.STEP >= ResCtrl.MAX_BITS @property def is_min_level(self) -> 
bool: # FIXME: hard coded - return self._cur_step is None or self._cur_step - CAT.STEP <= CAT.MIN + return self._cur_step is None or self._cur_step - ResCtrl.STEP <= ResCtrl.MIN_BITS def _enforce(self) -> None: logger = logging.getLogger(__name__) + bg_socket_id = self._background_wl.socket_id + fg_socket_id = self._foreground_wl.socket_id + if self._cur_step is None: logger.info('CAT off') - # FIXME: hard coded - mask = CAT.gen_mask(0, CAT.MAX) - CAT.assign(self._fg_grp_name, '1', mask) - CAT.assign(self._bg_grp_name, '1', mask) + # FIXME: The number of socket is two at most + mask = ResCtrl.gen_mask(0, ResCtrl.MAX_BITS) + if bg_socket_id == 0: + ResCtrl.assign_llc(self._bg_resctrl, mask, '1') + if bg_socket_id == 1: + ResCtrl.assign_llc(self._bg_resctrl, '1', mask) + if fg_socket_id == 0: + ResCtrl.assign_llc(self._fg_resctrl, mask, '1') + if fg_socket_id == 1: + ResCtrl.assign_llc(self._fg_resctrl, '1', mask) else: - logger.info(f'foreground : background = {self._cur_step} : {CAT.MAX - self._cur_step}') + logger.info(f'foreground : background = {self._cur_step} : {ResCtrl.MAX_BITS - self._cur_step}') + + # FIXME: The number of socket is two at most + fg_mask = ResCtrl.gen_mask(0, self._cur_step) + if fg_socket_id == 0: + ResCtrl.assign_llc(self._fg_resctrl, fg_mask, '1') + if fg_socket_id == 1: + ResCtrl.assign_llc(self._fg_resctrl, '1', fg_mask) - # FIXME: hard coded - fg_mask = CAT.gen_mask(0, self._cur_step) - CAT.assign(self._fg_grp_name, '1', fg_mask) + # FIXME: The number of socket is two at most + bg_mask = ResCtrl.gen_mask(self._cur_step) + if bg_socket_id == 0: + ResCtrl.assign_llc(self._bg_resctrl, bg_mask, '1') + if bg_socket_id == 1: + ResCtrl.assign_llc(self._bg_resctrl, '1', bg_mask) - # FIXME: hard coded - bg_mask = CAT.gen_mask(self._cur_step) - CAT.assign(self._bg_grp_name, '1', bg_mask) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() @@ -128,7 +144,7 @@ def _monitoring_result(self) -> NextStep: 
logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') if self._cur_step is not None \ - and not (CAT.MIN < self._cur_step < CAT.MAX) \ + and not (ResCtrl.MIN_BITS < self._cur_step < ResCtrl.MAX_BITS) \ or abs(diff_of_diff) <= CacheIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CacheIsolator._DOD_THRESHOLD: return NextStep.STOP diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 034a04a..add83ad 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -26,9 +26,14 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._fg_next_step = NextStep.IDLE self._bg_next_step = NextStep.IDLE + self._fg_grp_name: str = f'{foreground_wl.name}_{foreground_wl.pid}' self._bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' + + self._prev_fg_affinity: Tuple[int] = foreground_wl.cpuset self._prev_bg_affinity: Tuple[int] = background_wl.cpuset - self._cgroup = Cgroup(self._bg_grp_name, 'cpuset,cpu') + + self._fg_cgroup = Cgroup(self._fg_grp_name, 'cpuset,cpu') + self._bg_cgroup = Cgroup(self._bg_grp_name, 'cpuset,cpu') cpu_topo, mem_topo = NumaTopology.get_numa_info() self._cpu_topo: Dict[int, Set[int]] = cpu_topo @@ -36,7 +41,9 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: def __del__(self) -> None: if self._background_wl.is_running: - self._cgroup.assign_cpus(set(self._prev_bg_affinity)) + self._bg_cgroup.assign_cpus(set(self._prev_bg_affinity)) + if self._foreground_wl.is_running: + self._fg_cgroup.assign_cpus(set(self._prev_fg_affinity)) def strengthen(self) -> 'CoreIsolator': """ @@ -97,8 +104,8 @@ def _enforce(self) -> None: logger.info(f'affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') logger.info(f'affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') - self._cgroup.assign_cpus(set(self._bg_cpuset)) - 
self._cgroup.assign_cpus(set(self._fg_cpuset)) + self._bg_cgroup.assign_cpus(set(self._bg_cpuset)) + self._fg_cgroup.assign_cpus(set(self._fg_cpuset)) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 75793dd..10644af 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -13,7 +13,6 @@ class ResourceType(IntEnum): CACHE = 0 MEMORY = 1 - class IsolationPolicy(metaclass=ABCMeta): _IDLE_ISOLATOR: IdleIsolator = IdleIsolator() diff --git a/isolating_controller/utils/__init__.py b/isolating_controller/utils/__init__.py index ce410ef..fad398f 100644 --- a/isolating_controller/utils/__init__.py +++ b/isolating_controller/utils/__init__.py @@ -1,6 +1,7 @@ # coding: UTF-8 from .cat import CAT +from .resctrl import ResCtrl from .cgroup_cpuset import CgroupCpuset from .cgroup import Cgroup from .dvfs import DVFS diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py new file mode 100644 index 0000000..d7f13ed --- /dev/null +++ b/isolating_controller/utils/resctrl.py @@ -0,0 +1,67 @@ +# coding: UTF-8 + +import subprocess +import asyncio +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +#import aiofiles +#from aiofiles.base import AiofilesContextManager + + +def len_of_mask(mask: str) -> int: + cnt = 0 + num = int(mask, 16) + while num is not 0: + cnt += 1 + num >>= 1 + return cnt + + +def bits_to_mask(bits: int) -> str: + return f'{bits:x}' + + +class ResCtrl: + MOUNT_POINT: Path = Path('/sys/fs/resctrl') + MAX_MASK: str = Path('/sys/fs/resctrl/info/L3/cbm_mask').read_text(encoding='ASCII').strip() + MAX_BITS: int = len_of_mask((MOUNT_POINT / 'info' / 'L3' / 'cbm_mask').read_text()) + MIN_BITS: int = int((MOUNT_POINT / 'info' / 'L3' / 'min_cbm_bits').read_text()) + 
MIN_MASK: str = bits_to_mask(MIN_BITS) + STEP = 1 + + def __init__(self, group_name: str) -> None: + self._group_name: str = group_name + self._group_path: Path = ResCtrl.MOUNT_POINT/f'{group_name}' + + @property + def group_name(self): + return self._group_name + + @group_name.setter + def group_name(self, new_name): + self._group_name = new_name + self._group_path: Path = ResCtrl.MOUNT_POINT / new_name + + def add_task(self, pid: int) -> None: + subprocess.run(args=('sudo', 'tee', str(self._group_path / 'tasks')), + input=f'{pid}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) + + def assign_llc(self, *masks: str) -> None: + masks = (f'{i}={mask}' for i, mask in enumerate(masks)) + mask = ';'.join(masks) + subprocess.run(args=('sudo', 'tee', str(ResCtrl.MOUNT_POINT / self._group_name / 'schemata')), + input=f'L3:{mask}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) + + @staticmethod + def gen_mask(start: int, end: int = None) -> str: + if end is None or end > ResCtrl.MAX_BITS: + end = ResCtrl.MAX_BITS + + if start < 0: + raise ValueError('start must be greater than 0') + + return format(((1 << (end - start)) - 1) << (ResCtrl.MAX_BITS - end), 'x') + + def remove_group(self) -> None: + subprocess.check_call(args=('sudo', 'rmdir', str(ResCtrl.MOUNT_POINT / self._group_name))) From 40b29ebf976b7f9d42dd3e5898e1c3892020bbbb Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Fri, 21 Sep 2018 17:42:05 +0900 Subject: [PATCH 10/82] feat: Add Swap related code --- controller.py | 2 ++ .../isolation/isolators/swap.py | 31 +++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/controller.py b/controller.py index ca39610..78528e6 100755 --- a/controller.py +++ b/controller.py @@ -143,6 +143,8 @@ def __init__(self, pending_queue: PendingQueue) -> None: def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) + ##TODO: Swapper may come here + for group, iteration_num in self._isolation_groups.items(): 
logger.info('') logger.info(f'***************isolation of {group.name} #{iteration_num}***************') diff --git a/isolating_controller/isolation/isolators/swap.py b/isolating_controller/isolation/isolators/swap.py index 3be9aca..2bcdbcd 100644 --- a/isolating_controller/isolation/isolators/swap.py +++ b/isolating_controller/isolation/isolators/swap.py @@ -18,6 +18,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload, self._all_groups = isolation_groups self._swap_candidates: Set[Workload] = None + self._most_contentious_group = None + self._most_contentious_workload = None def __del__(self): logger = logging.getLogger(__name__) @@ -28,16 +30,16 @@ def __del__(self): logger.debug(f'reset swap configuration of {self._background_wl}') - def strengthen(self) -> 'Isolator': + def strengthen(self) -> 'SwapIsolator': """ Choosing which contentious workloads to swap out to other socket :return: """ # FIXME: hard coded (two sockets) - ## Estimating the socket contention - ## + ## 1.Estimating and selecting the most contentious workloads from the socket of cur_group + ## 2. 
- return + return self @property def is_max_level(self) -> bool: @@ -46,9 +48,9 @@ def is_max_level(self) -> bool: e.g., There is no searchable candidate to strengthen the degree of isolation :return: """ - # FIXME: + # FIXME: hard coded + return self._swap_candidates == None - return False @property def is_min_level(self) -> bool: @@ -57,17 +59,18 @@ def is_min_level(self) -> bool: e.g., There is no searchable candidate to weaken the degree of isolation :return: """ - # FIXME: + # FIXME: hard coded + return self._swap_candidates == None - return False - def weaken(self) -> 'Isolator': + def weaken(self) -> 'SwapIsolator': """ Choosing which contentious workloads to swap in from other socket :return: """ # FIXME: hard coded (two sockets) - pass + ## 1.Estimating and selecting the most contentious workloads from the socket of other_group + return self def _enforce(self) -> None: """ @@ -82,7 +85,15 @@ def _enforce(self) -> None: # self._enforce() def _first_decision(self) -> NextStep: + """ + How to choose the first candidate? + :return: + """ pass def _monitoring_result(self) -> NextStep: + """ + If the effect of swapping is getting worse, then rollback?? 
+ :return: + """ pass From 43473089610a67093a74a11fb0bed369fde2dd74 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Sun, 23 Sep 2018 22:46:11 +0900 Subject: [PATCH 11/82] feat: Separate fg and bg in CoreIsolator --- controller.py | 10 +- .../isolation/isolators/base_isolator.py | 3 + .../isolation/isolators/core.py | 99 +++++++++++++++---- .../isolation/isolators/idle.py | 4 + .../metric_container/basic_metric.py | 7 +- isolating_controller/workload.py | 2 +- pending_queue.py | 33 ++++--- 7 files changed, 115 insertions(+), 43 deletions(-) diff --git a/controller.py b/controller.py index 78528e6..cf90303 100755 --- a/controller.py +++ b/controller.py @@ -20,7 +20,7 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import DiffPolicy, IsolationPolicy +from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, IsolationPolicy from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue @@ -46,7 +46,7 @@ def __init__(self, metric_buf_size: int) -> None: self._rmq_creation_queue = 'workload_creation' ## FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) - self._pending_wl = PendingQueue(DiffPolicy, 4) + self._pending_wl = PendingQueue(GreedyDiffWViolationPolicy, 2) self._control_thread = ControlThread(self._pending_wl) self._lock = RLock() @@ -61,10 +61,12 @@ def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicP if len(arr) != 5: return - wl_name, wl_type, pid, perf_pid, perf_interval = arr + wl_identifier, wl_type, pid, perf_pid, perf_interval = arr pid = int(pid) perf_pid = int(perf_pid) perf_interval = int(perf_interval) + item = wl_identifier.split('_') + wl_name = item[0] if not psutil.pid_exists(pid): return @@ -137,7 +139,7 @@ def 
__init__(self, pending_queue: PendingQueue) -> None: self._pending_queue: PendingQueue = pending_queue - self._interval: int = 2 # Scheduling interval + self._interval: float = 0.2 # Scheduling interval self._isolation_groups: Dict[IsolationPolicy, int] = dict() def _isolate_workloads(self) -> None: diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index 264696c..93a9ed7 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -14,6 +14,9 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._foreground_wl = foreground_wl self._background_wl = background_wl + self._fg_next_step = NextStep.IDLE + self._bg_next_step = NextStep.IDLE + self._is_first_decision: bool = True @abstractmethod diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index add83ad..325210c 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -23,8 +23,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._cur_bg_step: int = min(self._bg_cpuset) self._cur_fg_step: int = max(self._fg_cpuset) - self._fg_next_step = NextStep.IDLE - self._bg_next_step = NextStep.IDLE + #self._fg_next_step = NextStep.IDLE + #self._bg_next_step = NextStep.IDLE self._fg_grp_name: str = f'{foreground_wl.name}_{foreground_wl.pid}' self._bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' @@ -52,14 +52,20 @@ def strengthen(self) -> 'CoreIsolator': :return: """ # NOTE: Caller is assumed that BG workload + logger = logging.getLogger(__name__) + logger.info(f'self._cur_bg_step: {self._cur_bg_step}') + logger.info(f'self._cur_fg_step: {self._cur_fg_step}') + logger.info(f'self._bg_next_step: {self._bg_next_step.name}') + logger.info(f'self._fg_next_step: 
{self._fg_next_step.name}') + if self._bg_next_step == NextStep.STRENGTHEN: - self._cur_bg_step += 1 bg_cpuset = set(self._bg_cpuset) bg_cpuset.remove(self._cur_bg_step) self._bg_cpuset = tuple(bg_cpuset) + self._cur_bg_step += 1 if self._fg_next_step == NextStep.WEAKEN: - self._cur_fg_step += 1 fg_cpuset = set(self._fg_cpuset) + self._cur_fg_step += 1 fg_cpuset.add(self._cur_fg_step) self._fg_cpuset = tuple(fg_cpuset) return self @@ -71,33 +77,56 @@ def weaken(self) -> 'CoreIsolator': :return: """ # NOTE: Caller is assumed that BG workload + logger = logging.getLogger(__name__) + logger.info(f'self._cur_bg_step: {self._cur_bg_step}') + logger.info(f'self._cur_fg_step: {self._cur_fg_step}') + logger.info(f'self._bg_next_step: {self._bg_next_step.name}') + logger.info(f'self._fg_next_step: {self._fg_next_step.name}') + if self._bg_next_step == NextStep.WEAKEN: - self._cur_bg_step -= 1 bg_cpuset = set(self._bg_cpuset) + self._cur_bg_step -= 1 bg_cpuset.add(self._cur_bg_step) self._bg_cpuset = tuple(bg_cpuset) if self._fg_next_step == NextStep.STRENGTHEN: - self._cur_fg_step -= 1 fg_cpuset = set(self._fg_cpuset) fg_cpuset.remove(self._cur_fg_step) self._fg_cpuset = tuple(fg_cpuset) + self._cur_fg_step -= 1 return self @property def is_max_level(self) -> bool: + logger = logging.getLogger(__name__) + logger.info(f'bg max cpuset: {max(self._cpu_topo[self._background_wl.socket_id])}') + logger.info(f'self._cur_bg_step: {self._cur_bg_step}') + logger.info(f'self._cur_fg_step: {self._cur_fg_step}') + logger.info(f'self._bg_next_step: {self._bg_next_step.name}') + logger.info(f'self._fg_next_step: {self._fg_next_step.name}') # FIXME: How about first condition is true but the other is false? 
- if self._bg_next_step == NextStep.STRENGTHEN: - return self._cur_bg_step == max(self._cpu_topo[self._background_wl.socket_id]) - if self._fg_next_step == NextStep.WEAKEN: - return self._cur_fg_step == self._cur_bg_step-1 + if self._cur_bg_step == max(self._cpu_topo[self._background_wl.socket_id]): + self._bg_next_step = NextStep.STOP + return True + #if self._cur_fg_step == self._cur_bg_step-1: + # self._fg_next_step = NextStep.STOP + else: + return False @property def is_min_level(self) -> bool: + logger = logging.getLogger(__name__) + logger.info(f'self._cur_bg_step: {self._cur_bg_step}') + logger.info(f'self._cur_fg_step: {self._cur_fg_step}') + logger.info(f'self._bg_next_step: {self._bg_next_step.name}') + logger.info(f'self._fg_next_step: {self._fg_next_step.name}') + # FIXME: How about first condition is true but the other is false? - if self._bg_next_step == NextStep.WEAKEN: - return self._cur_bg_step == self._cur_fg_step+1 - if self._fg_next_step == NextStep.STRENGTHEN: - return self._cur_fg_step == min(self._cpu_topo[self._foreground_wl.socket_id]) + if self._cur_bg_step == self._cur_fg_step+1: + return True + #if self._cur_fg_step == min(self._cpu_topo[self._foreground_wl.socket_id]): + # return True + else: + return False def _enforce(self) -> None: logger = logging.getLogger(__name__) @@ -151,25 +180,53 @@ def _monitoring_result(self) -> NextStep: logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) - fg_strengthen_cond = None - fg_weaken_cond = None + fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.ipc) + fg_weaken_cond = self.fg_weaken_cond(metric_diff.ipc) + + logger = logging.getLogger(__name__) + logger.info(f'metric_diff.ipc: {metric_diff.ipc}') + logger.info(f'self.fg_strengthen_cond: {fg_strengthen_cond}') + logger.info(f'self.fg_weaken_cond: {fg_weaken_cond}') + + # FIXME: Assumption about fg's cpuset IDs are smaller than 
bg's ones. (kind of hard coded) max_bg_cpuid = max(self._cpu_topo[self._background_wl.socket_id]) - min_bg_cpuid = min(self._cpu_topo[self._background_wl.socket_id]) - if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid) \ - or abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: + min_bg_cpuid = max(self._fg_cpuset)+1 + + # Case1 : diff is too small to perform isolation + if abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ + or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: self._bg_next_step = NextStep.STOP - self._fg_next_step = NextStep.STOP + self._fg_next_step = NextStep.STOP # This line depends on bg status return NextStep.STOP + # Case2 : FG shows lower contention than solo-run -> Slower FG or Faster BG elif curr_diff > 0: self._bg_next_step = NextStep.WEAKEN + if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): + self._bg_next_step = NextStep.STOP if fg_strengthen_cond: self._fg_next_step = NextStep.STRENGTHEN return NextStep.WEAKEN + # Case3 : FG shows higher contention than solo-run else: self._bg_next_step = NextStep.STRENGTHEN + if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): + self._bg_next_step = NextStep.STOP if fg_weaken_cond: self._fg_next_step = NextStep.WEAKEN return NextStep.STRENGTHEN + + @staticmethod + def fg_strengthen_cond(fg_ipc_diff) -> bool: + if fg_ipc_diff > 0: + return True + else: + return False + + @staticmethod + def fg_weaken_cond(fg_ipc_diff) -> bool: + if fg_ipc_diff <= 0: + return True + else: + return False diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index d720a59..9a2ec15 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -26,10 +26,14 @@ def _enforce(self) -> None: pass def _first_decision(self) -> NextStep: + self._fg_next_step = NextStep.IDLE + self._bg_next_step = NextStep.IDLE return NextStep.IDLE def 
decide_next_step(self) -> NextStep: return self._monitoring_result() def _monitoring_result(self) -> NextStep: + self._fg_next_step = NextStep.IDLE + self._bg_next_step = NextStep.IDLE return NextStep.IDLE diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index fcb79d4..eb889c7 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -71,7 +71,7 @@ def req_date(self): return self._req_date @property - def ipc(self): + def ipc(self) -> float: return self._instructions / self._cycles @property @@ -113,6 +113,7 @@ def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio self._local_mem_ps = curr.local_mem_ps() / prev.local_mem_ps() - 1 self._remote_mem_ps = curr.remote_mem_ps() / prev.remote_mem_ps() - 1 + self._ipc = curr.ipc - prev.ipc @property def l3_hit_ratio(self): @@ -126,5 +127,9 @@ def local_mem_util_ps(self): def remote_mem_ps(self): return self._remote_mem_ps + @property + def ipc(self): + return self._ipc + def __repr__(self) -> str: return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, Local Memory access diff: {self._local_mem_ps:>6.03f}' diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 46f614c..805f2a1 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -94,7 +94,7 @@ def get_socket_id(self) -> int: # FIXME: Hardcode for assumption (one workload to one socket) for socket_id, skt_cpus in cpu_topo.items(): - print(f'cpuset: {cpuset}, socket_id: {socket_id}, skt_cpus: {skt_cpus}') + #print(f'cpuset: {cpuset}, socket_id: {socket_id}, skt_cpus: {skt_cpus}') for cpu_id in cpuset: if cpu_id in skt_cpus: ret = socket_id diff --git a/pending_queue.py b/pending_queue.py index 877d975..88d2fe9 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -80,22 +80,23 @@ def 
dump_to_pending_list(self) -> None: # Grouping pids based on their types and skt_id for node in node_list: node_pidset = group_pids[node] - pid = node_pidset.pop() - print(f'Pop {pid}!') - if pid in fg_pids: - bg_pid = node_pidset.pop() - print(f'Pop {bg_pid}!') - new_group = self._policy_type(self._fg_q[pid], self._bg_q[bg_pid], node) - self._pending_list.append(new_group) - del self._fg_q[pid] - del self._bg_q[bg_pid] - elif pid in bg_pids: - fg_pid = node_pidset.pop() - print(f'Pop {fg_pid}!') - new_group = self._policy_type(self._fg_q[fg_pid], self._bg_q[pid], node) - self._pending_list.append(new_group) - del self._fg_q[fg_pid] - del self._bg_q[pid] + if len(node_pidset) > 0: + pid = node_pidset.pop() + print(f'Pop {pid}!') + if pid in fg_pids: + bg_pid = node_pidset.pop() + print(f'Pop {bg_pid}!') + new_group = self._policy_type(self._fg_q[pid], self._bg_q[bg_pid], node) + self._pending_list.append(new_group) + del self._fg_q[pid] + del self._bg_q[bg_pid] + elif pid in bg_pids: + fg_pid = node_pidset.pop() + print(f'Pop {fg_pid}!') + new_group = self._policy_type(self._fg_q[fg_pid], self._bg_q[pid], node) + self._pending_list.append(new_group) + del self._fg_q[fg_pid] + del self._bg_q[pid] return def update_max_pending(self, new_max_pending: int): From d559567cc19103e352399c3c8ce9107253340ac0 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Mon, 24 Sep 2018 15:32:37 +0900 Subject: [PATCH 12/82] feat: Add ResourceType.CPU and related logics --- controller.py | 6 +- .../isolation/isolators/base_isolator.py | 6 +- .../isolation/isolators/cache.py | 2 +- .../isolation/isolators/core.py | 7 +-- .../isolation/isolators/swap.py | 2 +- .../isolation/policies/__init__.py | 1 + .../isolation/policies/base_policy.py | 11 +++- .../isolation/policies/diff_policy.py | 6 +- .../isolation/policies/diff_policy_cpu.py | 61 +++++++++++++++++++ .../isolation/policies/greedy_diff_policy.py | 15 +++-- 10 files changed, 95 insertions(+), 22 deletions(-) create mode 100644 
isolating_controller/isolation/policies/diff_policy_cpu.py diff --git a/controller.py b/controller.py index cf90303..20b2020 100755 --- a/controller.py +++ b/controller.py @@ -20,7 +20,7 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, IsolationPolicy +from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, DiffCPUPolicy, IsolationPolicy from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue @@ -46,7 +46,7 @@ def __init__(self, metric_buf_size: int) -> None: self._rmq_creation_queue = 'workload_creation' ## FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) - self._pending_wl = PendingQueue(GreedyDiffWViolationPolicy, 2) + self._pending_wl = PendingQueue(DiffCPUPolicy, 2) self._control_thread = ControlThread(self._pending_wl) self._lock = RLock() @@ -145,7 +145,7 @@ def __init__(self, pending_queue: PendingQueue) -> None: def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - ##TODO: Swapper may come here + ## TODO: Swapper may come here for group, iteration_num in self._isolation_groups.items(): logger.info('') diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index 93a9ed7..af129bd 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -2,13 +2,16 @@ from abc import ABCMeta, abstractmethod +from typing import Optional + from .. 
import NextStep from ...metric_container.basic_metric import MetricDiff from ...workload import Workload +from ..policies.base_policy import IsolationPolicy, ResourceType class Isolator(metaclass=ABCMeta): - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: self._prev_metric_diff: MetricDiff = foreground_wl.calc_metric_diff() self._foreground_wl = foreground_wl @@ -18,6 +21,7 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._bg_next_step = NextStep.IDLE self._is_first_decision: bool = True + self._contentious_resource: Optional[ResourceType] = cont_resource @abstractmethod def strengthen(self) -> 'Isolator': diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index e7f2002..99d0bb3 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -15,7 +15,7 @@ class CacheIsolator(Isolator): _FORCE_THRESHOLD = 0.1 def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - super().__init__(foreground_wl, background_wl) + super().__init__(foreground_wl, background_wl, None) self._prev_step: Optional[int] = None self._cur_step: Optional[int] = None diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 325210c..b1a14f4 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -16,16 +16,13 @@ class CoreIsolator(Isolator): _FORCE_THRESHOLD = 0.1 def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - super().__init__(foreground_wl, background_wl) + super().__init__(foreground_wl, background_wl, None) self._fg_cpuset: Tuple[int] = foreground_wl.cpuset self._bg_cpuset: Tuple[int] = background_wl.cpuset 
self._cur_bg_step: int = min(self._bg_cpuset) self._cur_fg_step: int = max(self._fg_cpuset) - #self._fg_next_step = NextStep.IDLE - #self._bg_next_step = NextStep.IDLE - self._fg_grp_name: str = f'{foreground_wl.name}_{foreground_wl.pid}' self._bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' @@ -196,7 +193,7 @@ def _monitoring_result(self) -> NextStep: if abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: self._bg_next_step = NextStep.STOP - self._fg_next_step = NextStep.STOP # This line depends on bg status + #self._fg_next_step = NextStep.STOP # This line depends on bg status return NextStep.STOP # Case2 : FG shows lower contention than solo-run -> Slower FG or Faster BG diff --git a/isolating_controller/isolation/isolators/swap.py b/isolating_controller/isolation/isolators/swap.py index 2bcdbcd..bedc04e 100644 --- a/isolating_controller/isolation/isolators/swap.py +++ b/isolating_controller/isolation/isolators/swap.py @@ -14,7 +14,7 @@ class SwapIsolator(Isolator): def __init__(self, foreground_wl: Workload, background_wl: Workload, isolation_groups: Dict[IsolationPolicy, int]) -> None: - super().__init__(foreground_wl, background_wl) + super().__init__(foreground_wl, background_wl, None) self._all_groups = isolation_groups self._swap_candidates: Set[Workload] = None diff --git a/isolating_controller/isolation/policies/__init__.py b/isolating_controller/isolation/policies/__init__.py index 6f9c6ee..bde7236 100644 --- a/isolating_controller/isolation/policies/__init__.py +++ b/isolating_controller/isolation/policies/__init__.py @@ -2,6 +2,7 @@ from .base_policy import IsolationPolicy, ResourceType from .diff_policy import DiffPolicy +from .diff_policy_cpu import DiffCPUPolicy from .diff_with_violation_policy import DiffWViolationPolicy from .greedy_diff_policy import GreedyDiffPolicy from .greedy_diff_with_violation_policy import GreedyDiffWViolationPolicy diff --git 
a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 10644af..1259ded 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -10,11 +10,15 @@ class ResourceType(IntEnum): - CACHE = 0 - MEMORY = 1 + CPU = 0 + CACHE = 1 + MEMORY = 2 + class IsolationPolicy(metaclass=ABCMeta): _IDLE_ISOLATOR: IdleIsolator = IdleIsolator() + # FIXME : _CPU_THRESHOLD needs test + _CPU_THRESHOLD = 0.01 def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: self._fg_wl = fg_wl @@ -51,6 +55,9 @@ def contentious_resource(self) -> ResourceType: logger = logging.getLogger(__name__) logger.info(repr(metric_diff)) + if abs(metric_diff.local_mem_util_ps) < IsolationPolicy._CPU_THRESHOLD \ + and abs(metric_diff.l3_hit_ratio) < IsolationPolicy._CPU_THRESHOLD: + return ResourceType.CPU if metric_diff.local_mem_util_ps > 0 and metric_diff.l3_hit_ratio > 0: if metric_diff.l3_hit_ratio > metric_diff.local_mem_util_ps: diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index f36c22e..00413ff 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -30,10 +30,6 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() - if self._is_sched_isolated and self._is_mem_isolated and self._is_llc_isolated: - self._clear_flags() - logger.debug('****All isolators are applicable for now!****') - if not self._is_llc_isolated and resource is ResourceType.CACHE: self._cur_isolator = self._isolator_map[CacheIsolator] self._is_llc_isolated = True @@ -49,7 +45,7 @@ def choose_next_isolator(self) -> bool: elif not self._is_sched_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[CoreIsolator] self._is_sched_isolated = True - 
logger.info(f'Cpuset Isolation for {self._fg_wl} is started') + logger.info(f'Core Isolation for {self._fg_wl} is started') return True else: diff --git a/isolating_controller/isolation/policies/diff_policy_cpu.py b/isolating_controller/isolation/policies/diff_policy_cpu.py new file mode 100644 index 0000000..8289703 --- /dev/null +++ b/isolating_controller/isolation/policies/diff_policy_cpu.py @@ -0,0 +1,61 @@ +# coding: UTF-8 + +import logging + +from .base_policy import IsolationPolicy, ResourceType +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator +from ...workload import Workload + + +class DiffCPUPolicy(IsolationPolicy): + def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: + super().__init__(fg_wl, bg_wl, skt_id) + + self._is_llc_isolated = False + self._is_mem_isolated = False + self._is_core_isolated = False + + @property + def new_isolator_needed(self) -> bool: + return isinstance(self._cur_isolator, IdleIsolator) + + def _clear_flags(self) -> None: + self._is_llc_isolated = False + self._is_mem_isolated = False + self._is_core_isolated = False + + def choose_next_isolator(self) -> bool: + logger = logging.getLogger(__name__) + logger.debug('looking for new isolation...') + + resource: ResourceType = self.contentious_resource() + + if not self._is_core_isolated and resource is ResourceType.CPU: + self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator._contentious_resource = ResourceType.CPU + logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') + return True + + elif not self._is_llc_isolated and resource is ResourceType.CACHE: + self._cur_isolator = self._isolator_map[CacheIsolator] + self._is_llc_isolated = True + logger.info(f'Cache Isolation for {self._fg_wl} is started to isolate {ResourceType.CACHE.name}s') + return True + + elif not self._is_mem_isolated and resource is ResourceType.MEMORY: + self._cur_isolator = 
self._isolator_map[MemoryIsolator] + self._is_mem_isolated = True + logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started ' + f'to isolate {ResourceType.MEMORY.name} BW') + return True + + elif not self._is_core_isolated and resource is ResourceType.MEMORY: + self._cur_isolator = self._isolator_map[CoreIsolator] + self._is_core_isolated = True + self._cur_isolator._contentious_resource = ResourceType.MEMORY + logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW ') + return True + + else: + logger.debug('A new Isolator has not been selected.') + return False diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index a0a4d43..f6f801c 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -23,21 +23,28 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() - if resource is ResourceType.CACHE: + if resource is ResourceType.CPU: + self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator._contentious_resource = ResourceType.CPU + logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') + return True + + elif resource is ResourceType.CACHE: self._cur_isolator = self._isolator_map[CacheIsolator] - logger.info(f'Cache Isolation for {self._fg_wl} is started') + logger.info(f'Cache Isolation for {self._fg_wl} is started to isolate {ResourceType.CACHE.name}s') return True elif not self._is_mem_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[MemoryIsolator] self._is_mem_isolated = True - logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') + logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started ' + f'to isolate {ResourceType.MEMORY.name} BW') return True elif resource is 
ResourceType.MEMORY: self._cur_isolator = self._isolator_map[CoreIsolator] self._is_mem_isolated = False - logger.info(f'Cpuset Isolation for {self._fg_wl} is started') + logger.info(f'Cpuset Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') return True else: From 62858e9bc105a73f886464f5628802e704e71dca Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Mon, 24 Sep 2018 18:26:56 +0900 Subject: [PATCH 13/82] fix: Change the diff policies for ResourceType.CPU --- .../isolation/isolators/base_isolator.py | 2 +- .../isolation/isolators/core.py | 20 +++++--- .../isolation/policies/diff_policy.py | 8 +-- .../isolation/policies/diff_policy_cpu.py | 2 +- .../metric_container/basic_metric.py | 3 +- isolating_controller/utils/__init__.py | 1 - isolating_controller/utils/cgroup_cpuset.py | 50 ------------------- 7 files changed, 22 insertions(+), 64 deletions(-) delete mode 100644 isolating_controller/utils/cgroup_cpuset.py diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index af129bd..2c119d4 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -7,7 +7,7 @@ from .. 
import NextStep from ...metric_container.basic_metric import MetricDiff from ...workload import Workload -from ..policies.base_policy import IsolationPolicy, ResourceType +from ..policies.base_policy import ResourceType class Isolator(metaclass=ABCMeta): diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index b1a14f4..33fa15d 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -10,6 +10,7 @@ from ...utils import Cgroup from ...utils import NumaTopology from ...utils import hyphen +from ..policies.base_policy import ResourceType class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 @@ -141,8 +142,8 @@ def _first_decision(self) -> NextStep: logger.debug(f'current diff: {curr_diff:>7.4f}') ## FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) - fg_strengthen_cond = None - fg_weaken_cond = None + fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.ipc) + fg_weaken_cond = self.fg_weaken_cond(metric_diff.ipc) if curr_diff < 0: if self.is_max_level: self._bg_next_step = NextStep.STOP @@ -167,10 +168,17 @@ def _first_decision(self) -> NextStep: def _monitoring_result(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() - - curr_diff = metric_diff.local_mem_util_ps - prev_diff = self._prev_metric_diff.local_mem_util_ps - diff_of_diff = curr_diff - prev_diff + curr_diff = None + diff_of_diff = None + + if self._contentious_resource == ResourceType.MEMORY: + curr_diff = metric_diff.local_mem_util_ps + prev_diff = self._prev_metric_diff.local_mem_util_ps + diff_of_diff = curr_diff - prev_diff + elif self._contentious_resource == ResourceType.CPU: + curr_diff = metric_diff.ipc + prev_diff = self._prev_metric_diff.ipc + diff_of_diff = curr_diff - prev_diff logger = logging.getLogger(__name__) logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') diff --git 
a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index 00413ff..84ae81f 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -13,7 +13,7 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: self._is_llc_isolated = False self._is_mem_isolated = False - self._is_sched_isolated = False + self._is_core_isolated = False @property def new_isolator_needed(self) -> bool: @@ -22,7 +22,7 @@ def new_isolator_needed(self) -> bool: def _clear_flags(self) -> None: self._is_llc_isolated = False self._is_mem_isolated = False - self._is_sched_isolated = False + self._is_core_isolated = False def choose_next_isolator(self) -> bool: logger = logging.getLogger(__name__) @@ -42,9 +42,9 @@ def choose_next_isolator(self) -> bool: logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') return True - elif not self._is_sched_isolated and resource is ResourceType.MEMORY: + elif not self._is_core_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[CoreIsolator] - self._is_sched_isolated = True + self._is_core_isolated = True logger.info(f'Core Isolation for {self._fg_wl} is started') return True diff --git a/isolating_controller/isolation/policies/diff_policy_cpu.py b/isolating_controller/isolation/policies/diff_policy_cpu.py index 8289703..bf904bb 100644 --- a/isolating_controller/isolation/policies/diff_policy_cpu.py +++ b/isolating_controller/isolation/policies/diff_policy_cpu.py @@ -30,7 +30,7 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() - if not self._is_core_isolated and resource is ResourceType.CPU: + if resource is ResourceType.CPU: self._cur_isolator = self._isolator_map[CoreIsolator] self._cur_isolator._contentious_resource = ResourceType.CPU logger.info(f'Core Isolation for {self._fg_wl} is started to isolate 
{ResourceType.CPU.name}s') diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index eb889c7..371184f 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -132,4 +132,5 @@ def ipc(self): return self._ipc def __repr__(self) -> str: - return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, Local Memory access diff: {self._local_mem_ps:>6.03f}' + return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, Local Memory access diff: {self._local_mem_ps:>6.03f},' \ + f'IPC diff: {self.ipc:>06.03f}' diff --git a/isolating_controller/utils/__init__.py b/isolating_controller/utils/__init__.py index fad398f..9902838 100644 --- a/isolating_controller/utils/__init__.py +++ b/isolating_controller/utils/__init__.py @@ -2,7 +2,6 @@ from .cat import CAT from .resctrl import ResCtrl -from .cgroup_cpuset import CgroupCpuset from .cgroup import Cgroup from .dvfs import DVFS from .numa_topology import NumaTopology diff --git a/isolating_controller/utils/cgroup_cpuset.py b/isolating_controller/utils/cgroup_cpuset.py deleted file mode 100644 index 0877bf1..0000000 --- a/isolating_controller/utils/cgroup_cpuset.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding: UTF-8 - -import subprocess -from typing import Set - -import psutil - - -class CgroupCpuset: - MOUNT_POINT = '/sys/fs/cgroup/cpuset' - - @staticmethod - def create_group(name: str) -> None: - subprocess.check_call(args=('sudo', 'mkdir', '-p', f'{CgroupCpuset.MOUNT_POINT}/{name}')) - - @staticmethod - def add_task(name: str, pid: int) -> None: - p = psutil.Process(pid) - - for thread in p.threads(): - subprocess.run(args=('sudo', 'tee', '-a', f'{CgroupCpuset.MOUNT_POINT}/{name}/tasks'), - input=f'{thread.id}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) - - for child in p.children(True): - for thread in child.threads(): - subprocess.run(args=('sudo', 'tee', '-a', 
f'{CgroupCpuset.MOUNT_POINT}/{name}/tasks'), - input=f'{thread.id}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) - - @staticmethod - def remove_group(name: str) -> None: - subprocess.check_call(args=('sudo', 'rmdir', f'/sys/fs/cgroup/cpuset/{name}')) - - @staticmethod - def assign(group_name: str, core_set: Set[int]) -> None: - subprocess.run(args=('sudo', 'tee', f'/sys/fs/cgroup/cpuset/{group_name}/cpuset.cpus'), - input=','.join(map(str, core_set)), check=True, encoding='ASCII', stdout=subprocess.DEVNULL) - - @staticmethod - def convert_to_set(hyphen_str: str) -> Set[int]: - ret = set() - - for elem in hyphen_str.split(','): - group = tuple(map(int, elem.split('-'))) - - if len(group) is 1: - ret.add(group[0]) - elif len(group) is 2: - ret.update(range(group[0], group[1] + 1)) - - return ret From 656d69216e65341f85c024c1e278d98e33efa1a7 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 25 Sep 2018 06:38:42 +0900 Subject: [PATCH 14/82] fix: Fix fg_strengthen/weaken_cond not to exceed socket boudnary --- controller.py | 2 +- isolating_controller/isolation/__init__.py | 7 ++ .../isolation/isolators/base_isolator.py | 3 +- .../isolation/isolators/cache.py | 9 ++- .../isolation/isolators/core.py | 67 ++++++++++++------- .../isolation/isolators/memory.py | 8 ++- .../isolation/isolators/swap.py | 1 + .../isolation/policies/__init__.py | 2 +- .../isolation/policies/base_policy.py | 25 +++---- .../isolation/policies/diff_policy.py | 3 +- .../isolation/policies/diff_policy_cpu.py | 3 +- .../policies/diff_with_violation_policy.py | 2 +- .../isolation/policies/greedy_diff_policy.py | 3 +- .../greedy_diff_with_violation_policy.py | 2 +- .../metric_container/basic_metric.py | 17 ++++- pending_queue.py | 1 - 16 files changed, 96 insertions(+), 59 deletions(-) diff --git a/controller.py b/controller.py index 20b2020..1497d7d 100755 --- a/controller.py +++ b/controller.py @@ -20,7 +20,7 @@ import isolating_controller from isolating_controller.isolation import 
NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, DiffCPUPolicy, IsolationPolicy +from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, DiffCPUPolicy, DiffPolicy, IsolationPolicy from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue diff --git a/isolating_controller/isolation/__init__.py b/isolating_controller/isolation/__init__.py index 104511c..999192f 100644 --- a/isolating_controller/isolation/__init__.py +++ b/isolating_controller/isolation/__init__.py @@ -8,3 +8,10 @@ class NextStep(IntEnum): WEAKEN = 2 STOP = 3 IDLE = 4 + + +class ResourceType(IntEnum): + CPU = 0 + CACHE = 1 + MEMORY = 2 + Unknown = 3 \ No newline at end of file diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index 2c119d4..5c2f734 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -4,10 +4,9 @@ from typing import Optional -from .. import NextStep +from .. import NextStep, ResourceType from ...metric_container.basic_metric import MetricDiff from ...workload import Workload -from ..policies.base_policy import ResourceType class Isolator(metaclass=ABCMeta): diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 99d0bb3..9f84f71 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -1,12 +1,11 @@ # coding: UTF-8 import logging -from typing import Optional, Dict, Set +from typing import Optional from .base_isolator import Isolator -from .. import NextStep +from .. 
import NextStep, ResourceType from ...utils import ResCtrl -from ...utils import NumaTopology from ...workload import Workload @@ -14,8 +13,8 @@ class CacheIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - super().__init__(foreground_wl, background_wl, None) + def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: + super().__init__(foreground_wl, background_wl, cont_resource) self._prev_step: Optional[int] = None self._cur_step: Optional[int] = None diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 33fa15d..fa24cdc 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -2,22 +2,22 @@ import logging -from typing import Tuple, Set, Dict +from typing import Tuple, Set, Dict, Optional from .base_isolator import Isolator -from .. import NextStep +from .. 
import NextStep, ResourceType from ...workload import Workload from ...utils import Cgroup from ...utils import NumaTopology from ...utils import hyphen -from ..policies.base_policy import ResourceType + class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - super().__init__(foreground_wl, background_wl, None) + def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: + super().__init__(foreground_wl, background_wl, cont_resource) self._fg_cpuset: Tuple[int] = foreground_wl.cpuset self._bg_cpuset: Tuple[int] = background_wl.cpuset @@ -128,20 +128,27 @@ def is_min_level(self) -> bool: def _enforce(self) -> None: logger = logging.getLogger(__name__) - logger.info(f'affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') - logger.info(f'affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') + logger.info(f'after enforcing : self._cur_bg_step is {self._cur_bg_step}') + logger.info(f'after enforcing : self._cur_fg_step is {self._cur_fg_step}') + logger.info(f'after enforcing : affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') + logger.info(f'after enforcing : affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') self._bg_cgroup.assign_cpus(set(self._bg_cpuset)) self._fg_cgroup.assign_cpus(set(self._fg_cpuset)) def _first_decision(self) -> NextStep: + curr_diff = None metric_diff = self._foreground_wl.calc_metric_diff() - curr_diff = metric_diff.local_mem_util_ps + + if self._contentious_resource == ResourceType.MEMORY: + curr_diff = metric_diff.local_mem_util_ps + elif self._contentious_resource == ResourceType.CPU: + curr_diff = metric_diff.ipc logger = logging.getLogger(__name__) logger.debug(f'current diff: {curr_diff:>7.4f}') - ## FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) + # FIXME: Specifying 
fg's strengthen/weaken condition (related to fg's performance) fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.ipc) fg_weaken_cond = self.fg_weaken_cond(metric_diff.ipc) if curr_diff < 0: @@ -170,7 +177,8 @@ def _monitoring_result(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() curr_diff = None diff_of_diff = None - + logger = logging.getLogger(__name__) + logger.info(f'self._contentious_resource: {self._contentious_resource.name}') if self._contentious_resource == ResourceType.MEMORY: curr_diff = metric_diff.local_mem_util_ps prev_diff = self._prev_metric_diff.local_mem_util_ps @@ -193,45 +201,56 @@ def _monitoring_result(self) -> NextStep: logger.info(f'self.fg_strengthen_cond: {fg_strengthen_cond}') logger.info(f'self.fg_weaken_cond: {fg_weaken_cond}') - # FIXME: Assumption about fg's cpuset IDs are smaller than bg's ones. (kind of hard coded) - max_bg_cpuid = max(self._cpu_topo[self._background_wl.socket_id]) - min_bg_cpuid = max(self._fg_cpuset)+1 - # Case1 : diff is too small to perform isolation if abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: self._bg_next_step = NextStep.STOP - #self._fg_next_step = NextStep.STOP # This line depends on bg status + # self._fg_next_step = NextStep.STOP # This line depends on bg status return NextStep.STOP # Case2 : FG shows lower contention than solo-run -> Slower FG or Faster BG elif curr_diff > 0: self._bg_next_step = NextStep.WEAKEN - if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): + if self.bg_outside_boundary(): self._bg_next_step = NextStep.STOP - if fg_strengthen_cond: + if fg_strengthen_cond is True: self._fg_next_step = NextStep.STRENGTHEN + elif fg_strengthen_cond is False: + self._fg_next_step = NextStep.STOP return NextStep.WEAKEN # Case3 : FG shows higher contention than solo-run else: self._bg_next_step = NextStep.STRENGTHEN - if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): + if 
self.bg_outside_boundary(): self._bg_next_step = NextStep.STOP if fg_weaken_cond: self._fg_next_step = NextStep.WEAKEN + elif fg_weaken_cond is False: + self._fg_next_step = NextStep.STOP return NextStep.STRENGTHEN - @staticmethod - def fg_strengthen_cond(fg_ipc_diff) -> bool: - if fg_ipc_diff > 0: + def bg_outside_boundary(self) -> bool: + # FIXME: Assumption about fg's cpuset IDs are smaller than bg's ones. (kind of hard coded) + max_bg_cpuid = max(self._cpu_topo[self._background_wl.socket_id]) + min_bg_cpuid = max(self._fg_cpuset)+1 + if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): return True else: return False - @staticmethod - def fg_weaken_cond(fg_ipc_diff) -> bool: - if fg_ipc_diff <= 0: + def fg_strengthen_cond(self, fg_ipc_diff) -> bool: + min_skt_cpuid = min(self._cpu_topo[self._foreground_wl.socket_id]) + if fg_ipc_diff > 0 and self._cur_fg_step > min_skt_cpuid: return True else: return False + + def fg_weaken_cond(self, fg_ipc_diff) -> bool: + if fg_ipc_diff <= 0: + free_cpu = self._cur_bg_step - self._cur_fg_step + if (free_cpu > 0 and self._bg_next_step != NextStep.WEAKEN) \ + or (free_cpu == 0 and self._bg_next_step == NextStep.STOP): + return True + else: + return False diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 3a2e156..107cacb 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -3,8 +3,10 @@ import logging from itertools import chain +from typing import Optional + from .base_isolator import Isolator -from .. import NextStep +from .. 
import NextStep, ResourceType from ...utils import DVFS from ...workload import Workload @@ -13,8 +15,8 @@ class MemoryIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - super().__init__(foreground_wl, background_wl) + def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: + super().__init__(foreground_wl, background_wl, cont_resource) self._bg_affinity = background_wl.cpuset diff --git a/isolating_controller/isolation/isolators/swap.py b/isolating_controller/isolation/isolators/swap.py index bedc04e..5857dcf 100644 --- a/isolating_controller/isolation/isolators/swap.py +++ b/isolating_controller/isolation/isolators/swap.py @@ -9,6 +9,7 @@ from ...workload import Workload from ..policies import IsolationPolicy + class SwapIsolator(Isolator): _THRESHOLD = 0.005 diff --git a/isolating_controller/isolation/policies/__init__.py b/isolating_controller/isolation/policies/__init__.py index bde7236..5e517fa 100644 --- a/isolating_controller/isolation/policies/__init__.py +++ b/isolating_controller/isolation/policies/__init__.py @@ -1,6 +1,6 @@ # coding: UTF-8 -from .base_policy import IsolationPolicy, ResourceType +from .base_policy import IsolationPolicy from .diff_policy import DiffPolicy from .diff_policy_cpu import DiffCPUPolicy from .diff_with_violation_policy import DiffWViolationPolicy diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 1259ded..b5c2e39 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -1,24 +1,18 @@ # coding: UTF-8 import logging from abc import ABCMeta, abstractmethod -from enum import IntEnum from typing import Mapping, Type -from isolating_controller.metric_container.basic_metric import MetricDiff +from 
isolating_controller.metric_container.basic_metric import MetricDiff, BasicMetric from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, CoreIsolator from ...workload import Workload - - -class ResourceType(IntEnum): - CPU = 0 - CACHE = 1 - MEMORY = 2 +from .. import ResourceType class IsolationPolicy(metaclass=ABCMeta): _IDLE_ISOLATOR: IdleIsolator = IdleIsolator() # FIXME : _CPU_THRESHOLD needs test - _CPU_THRESHOLD = 0.01 + _CPU_THRESHOLD = 0.1 def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: self._fg_wl = fg_wl @@ -34,11 +28,12 @@ def __hash__(self) -> int: def __repr__(self) -> str: return f'{self.__class__.__name__} ' + # FIXME: If you use policy without CPUIso., then changing ResourceType.Unknown to ResourceType.Memory def init_isolators(self) -> None: self._isolator_map = dict(( - (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), - (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)) + (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl, ResourceType.CACHE)), + (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl, ResourceType.MEMORY)), + (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl, ResourceType.Unknown)) )) @property @@ -52,11 +47,13 @@ def choose_next_isolator(self) -> bool: def contentious_resource(self) -> ResourceType: metric_diff: MetricDiff = self._fg_wl.calc_metric_diff() + cur_metric: BasicMetric = self._fg_wl.metrics[0] logger = logging.getLogger(__name__) logger.info(repr(metric_diff)) - if abs(metric_diff.local_mem_util_ps) < IsolationPolicy._CPU_THRESHOLD \ - and abs(metric_diff.l3_hit_ratio) < IsolationPolicy._CPU_THRESHOLD: + logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}') + if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ + and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: return ResourceType.CPU if metric_diff.local_mem_util_ps > 0 
and metric_diff.l3_hit_ratio > 0: diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index 84ae81f..cef9e77 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -2,7 +2,8 @@ import logging -from .base_policy import IsolationPolicy, ResourceType +from .. import ResourceType +from .base_policy import IsolationPolicy from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload diff --git a/isolating_controller/isolation/policies/diff_policy_cpu.py b/isolating_controller/isolation/policies/diff_policy_cpu.py index bf904bb..99be013 100644 --- a/isolating_controller/isolation/policies/diff_policy_cpu.py +++ b/isolating_controller/isolation/policies/diff_policy_cpu.py @@ -2,7 +2,8 @@ import logging -from .base_policy import IsolationPolicy, ResourceType +from .. import ResourceType +from .base_policy import IsolationPolicy from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload diff --git a/isolating_controller/isolation/policies/diff_with_violation_policy.py b/isolating_controller/isolation/policies/diff_with_violation_policy.py index 8df9003..c50b98d 100644 --- a/isolating_controller/isolation/policies/diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/diff_with_violation_policy.py @@ -2,7 +2,7 @@ import logging -from .base_policy import ResourceType +from .. 
import ResourceType from .diff_policy import DiffPolicy from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index f6f801c..da37000 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -2,7 +2,8 @@ import logging -from .base_policy import IsolationPolicy, ResourceType +from .. import ResourceType +from .base_policy import IsolationPolicy from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 980d178..ce7f3f2 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -2,7 +2,7 @@ import logging -from .base_policy import ResourceType +from .. 
import ResourceType from .greedy_diff_policy import GreedyDiffPolicy from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator from ...workload import Workload diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 371184f..8dc8fd0 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -2,6 +2,8 @@ from time import localtime, strftime +LLC_SIZE: float = 41943040 + class BasicMetric: def __init__(self, l2miss, l3miss, inst, cycles, stall_cycles, wall_cycles, intra_coh, inter_coh, llc_size, @@ -94,10 +96,18 @@ def l3miss_ratio(self): def l3hit_ratio(self) -> float: return 1 - self._l3miss / self._l2miss + @property + def llc_util(self) -> float: + return self._llc_size/LLC_SIZE + @property def l3_intensity(self): l3_hit_ratio = 1 - self.l3miss_ratio - return self._llc_size * l3_hit_ratio + return self.llc_util * l3_hit_ratio + + @property + def mem_intensity(self): + return self.llc_util * self.l3miss_ratio def __str__(self): return ', '.join(map(str, ( @@ -132,5 +142,6 @@ def ipc(self): return self._ipc def __repr__(self) -> str: - return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, Local Memory access diff: {self._local_mem_ps:>6.03f},' \ - f'IPC diff: {self.ipc:>06.03f}' + return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, ' \ + f'Local Memory access diff: {self._local_mem_ps:>6.03f}, ' \ + f'IPC diff: {self.ipc:>6.03f}' diff --git a/pending_queue.py b/pending_queue.py index 88d2fe9..6c2fad0 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -1,7 +1,6 @@ # coding: UTF-8 import logging -from threading import RLock from typing import Dict, List, Sized, Type From 62aae609218a5195204dcabe1323752fb60990aa Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 25 Sep 2018 14:53:06 +0900 Subject: [PATCH 15/82] refactor codes --- controller.py | 12 +- 
.../isolation/isolators/base_isolator.py | 10 +- .../isolation/isolators/cache.py | 79 +++++-------- .../isolation/isolators/core.py | 106 +++++++++--------- .../isolation/isolators/idle.py | 3 + .../isolation/isolators/memory.py | 6 +- .../isolation/isolators/swap.py | 24 ++-- .../isolation/policies/base_policy.py | 3 +- .../isolation/policies/diff_policy.py | 10 +- .../isolation/policies/diff_policy_cpu.py | 6 +- .../policies/diff_with_violation_policy.py | 6 +- .../isolation/policies/greedy_diff_policy.py | 6 +- .../greedy_diff_with_violation_policy.py | 6 +- isolating_controller/utils/__init__.py | 4 +- isolating_controller/utils/cgroup.py | 15 +-- isolating_controller/utils/cgroup/__init__.py | 5 + isolating_controller/utils/cgroup/base.py | 32 ++++++ isolating_controller/utils/cgroup/cpu.py | 14 +++ isolating_controller/utils/cgroup/cpuset.py | 19 ++++ isolating_controller/utils/numa_topology.py | 72 ++++++------ isolating_controller/utils/resctrl.py | 9 +- isolating_controller/workload.py | 36 +++--- pending_queue.py | 93 ++++----------- 23 files changed, 286 insertions(+), 290 deletions(-) create mode 100644 isolating_controller/utils/cgroup/__init__.py create mode 100644 isolating_controller/utils/cgroup/base.py create mode 100644 isolating_controller/utils/cgroup/cpu.py create mode 100644 isolating_controller/utils/cgroup/cpuset.py diff --git a/controller.py b/controller.py index 1497d7d..ba9c5d1 100755 --- a/controller.py +++ b/controller.py @@ -20,11 +20,10 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, DiffCPUPolicy, DiffPolicy, IsolationPolicy +from isolating_controller.isolation.policies import DiffCPUPolicy, DiffPolicy, IsolationPolicy from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from 
pending_queue import PendingQueue -from threading import RLock MIN_PYTHON = (3, 6) @@ -45,10 +44,9 @@ def __init__(self, metric_buf_size: int) -> None: self._rmq_host = 'localhost' self._rmq_creation_queue = 'workload_creation' - ## FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) + # FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) self._pending_wl = PendingQueue(DiffCPUPolicy, 2) self._control_thread = ControlThread(self._pending_wl) - self._lock = RLock() def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: ch.basic_ack(method.delivery_tag) @@ -74,10 +72,10 @@ def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicP workload = Workload(wl_name, wl_type, pid, perf_pid, perf_interval) if wl_type == 'bg': logger.info(f'{workload} is background process') - self._pending_wl.add_bg(workload) else: logger.info(f'{workload} is foreground process') - self._pending_wl.add_fg(workload) + + self._pending_wl.add(workload) logger.info(f'{workload} is created') @@ -145,7 +143,7 @@ def __init__(self, pending_queue: PendingQueue) -> None: def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - ## TODO: Swapper may come here + # TODO: Swapper may come here for group, iteration_num in self._isolation_groups.items(): logger.info('') diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index 5c2f734..5ff6842 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -1,7 +1,6 @@ # coding: UTF-8 from abc import ABCMeta, abstractmethod - from typing import Optional from .. 
import NextStep, ResourceType @@ -20,8 +19,12 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resour self._bg_next_step = NextStep.IDLE self._is_first_decision: bool = True + # FIXME: is it necessary? self._contentious_resource: Optional[ResourceType] = cont_resource + def __del__(self): + self.reset() + @abstractmethod def strengthen(self) -> 'Isolator': """ @@ -86,3 +89,8 @@ def decide_next_step(self) -> NextStep: else: return self._monitoring_result() + + @abstractmethod + def reset(self) -> None: + """Restore to initial configuration""" + pass diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 9f84f71..1c63694 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -4,8 +4,8 @@ from typing import Optional from .base_isolator import Isolator -from .. import NextStep, ResourceType -from ...utils import ResCtrl +from .. import NextStep +from ...utils import ResCtrl, numa_topology from ...workload import Workload @@ -13,33 +13,14 @@ class CacheIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: - super().__init__(foreground_wl, background_wl, cont_resource) + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + super().__init__(foreground_wl, background_wl) self._prev_step: Optional[int] = None self._cur_step: Optional[int] = None - self._fg_grp_name = f'{foreground_wl.name}_{foreground_wl.pid}' - self._bg_grp_name = f'{background_wl.name}_{background_wl.pid}' - - self._fg_resctrl = ResCtrl(self._fg_grp_name) - self._bg_resctrl = ResCtrl(self._bg_grp_name) - - def __del__(self) -> None: - logger = logging.getLogger(__name__) - - max_bits = ResCtrl.MAX_BITS - max_mask = ResCtrl.gen_mask(0, max_bits) - - if self._foreground_wl.is_running: - 
logger.debug(f'reset resctrl configuration of {self._foreground_wl}') - # FIXME: The number of socket is two at most - ResCtrl.assign_llc(self._fg_resctrl, max_mask, max_mask) - - if self._background_wl.is_running: - logger.debug(f'reset resctrl configuration of {self._background_wl}') - # FIXME: The number of socket is two at most - ResCtrl.assign_llc(self._bg_resctrl, max_mask, max_mask) + self._fg_resctrl = ResCtrl(f'{foreground_wl.name}_{foreground_wl.pid}') + self._bg_resctrl = ResCtrl(f'{background_wl.name}_{background_wl.pid}') def strengthen(self) -> 'CacheIsolator': self._prev_step = self._cur_step @@ -75,40 +56,22 @@ def is_min_level(self) -> bool: def _enforce(self) -> None: logger = logging.getLogger(__name__) - bg_socket_id = self._background_wl.socket_id - fg_socket_id = self._foreground_wl.socket_id - if self._cur_step is None: logger.info('CAT off') - - # FIXME: The number of socket is two at most - mask = ResCtrl.gen_mask(0, ResCtrl.MAX_BITS) - if bg_socket_id == 0: - ResCtrl.assign_llc(self._bg_resctrl, mask, '1') - if bg_socket_id == 1: - ResCtrl.assign_llc(self._bg_resctrl, '1', mask) - if fg_socket_id == 0: - ResCtrl.assign_llc(self._fg_resctrl, mask, '1') - if fg_socket_id == 1: - ResCtrl.assign_llc(self._fg_resctrl, '1', mask) + self.reset() else: logger.info(f'foreground : background = {self._cur_step} : {ResCtrl.MAX_BITS - self._cur_step}') - # FIXME: The number of socket is two at most - fg_mask = ResCtrl.gen_mask(0, self._cur_step) - if fg_socket_id == 0: - ResCtrl.assign_llc(self._fg_resctrl, fg_mask, '1') - if fg_socket_id == 1: - ResCtrl.assign_llc(self._fg_resctrl, '1', fg_mask) - - # FIXME: The number of socket is two at most - bg_mask = ResCtrl.gen_mask(self._cur_step) - if bg_socket_id == 0: - ResCtrl.assign_llc(self._bg_resctrl, bg_mask, '1') - if bg_socket_id == 1: - ResCtrl.assign_llc(self._bg_resctrl, '1', bg_mask) + # FIXME: hard coded -> The number of socket is two at most + masks = [ResCtrl.MIN_MASK, ResCtrl.MIN_MASK] + 
masks[self._foreground_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) + self._fg_resctrl.assign_llc(*masks) + # FIXME: hard coded -> The number of socket is two at most + masks = [ResCtrl.MIN_MASK, ResCtrl.MIN_MASK] + masks[self._background_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) + self._bg_resctrl.assign_llc(*masks) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() @@ -159,3 +122,15 @@ def _monitoring_result(self) -> NextStep: return NextStep.STOP else: return NextStep.STRENGTHEN + + def reset(self) -> None: + masks = [ResCtrl.MIN_MASK] * (max(numa_topology.cur_online_nodes()) + 1) + + if self._background_wl.is_running: + bg_masks = masks.copy() + bg_masks[self._background_wl.cur_socket_id()] = ResCtrl.MAX_MASK + ResCtrl.assign_llc(self._bg_resctrl, *bg_masks) + + if self._foreground_wl.is_running: + masks[self._foreground_wl.cur_socket_id()] = ResCtrl.MAX_MASK + ResCtrl.assign_llc(self._fg_resctrl, *masks) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index fa24cdc..bdaea18 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,60 +1,47 @@ # coding: UTF-8 import logging - -from typing import Tuple, Set, Dict, Optional +from typing import Tuple from .base_isolator import Isolator from .. 
import NextStep, ResourceType +from ...utils import hyphen, numa_topology +from ...utils.cgroup import CpuSet from ...workload import Workload -from ...utils import Cgroup -from ...utils import NumaTopology -from ...utils import hyphen class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: - super().__init__(foreground_wl, background_wl, cont_resource) + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + super().__init__(foreground_wl, background_wl) - self._fg_cpuset: Tuple[int] = foreground_wl.cpuset - self._bg_cpuset: Tuple[int] = background_wl.cpuset + self._fg_cpuset: Tuple[int, ...] = foreground_wl.cpuset + self._bg_cpuset: Tuple[int, ...] = background_wl.cpuset self._cur_bg_step: int = min(self._bg_cpuset) self._cur_fg_step: int = max(self._fg_cpuset) - self._fg_grp_name: str = f'{foreground_wl.name}_{foreground_wl.pid}' - self._bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' - - self._prev_fg_affinity: Tuple[int] = foreground_wl.cpuset - self._prev_bg_affinity: Tuple[int] = background_wl.cpuset - - self._fg_cgroup = Cgroup(self._fg_grp_name, 'cpuset,cpu') - self._bg_cgroup = Cgroup(self._bg_grp_name, 'cpuset,cpu') + fg_grp_name: str = f'{foreground_wl.name}_{foreground_wl.pid}' + bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' - cpu_topo, mem_topo = NumaTopology.get_numa_info() - self._cpu_topo: Dict[int, Set[int]] = cpu_topo - self._mem_topo: Set[int] = mem_topo + self._prev_fg_affinity: Tuple[int, ...] = foreground_wl.cpuset + self._prev_bg_affinity: Tuple[int, ...] 
= background_wl.cpuset - def __del__(self) -> None: - if self._background_wl.is_running: - self._bg_cgroup.assign_cpus(set(self._prev_bg_affinity)) - if self._foreground_wl.is_running: - self._fg_cgroup.assign_cpus(set(self._prev_fg_affinity)) + self._fg_cgroup = CpuSet(fg_grp_name) + self._bg_cgroup = CpuSet(bg_grp_name) def strengthen(self) -> 'CoreIsolator': """ Strengthen reduces the number of CPUs assigned to BG workloads and increase that of FG workload TODO: Changing step size, if needed - :return: """ # NOTE: Caller is assumed that BG workload logger = logging.getLogger(__name__) - logger.info(f'self._cur_bg_step: {self._cur_bg_step}') - logger.info(f'self._cur_fg_step: {self._cur_fg_step}') - logger.info(f'self._bg_next_step: {self._bg_next_step.name}') - logger.info(f'self._fg_next_step: {self._fg_next_step.name}') + logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') + logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') + logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') + logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') if self._bg_next_step == NextStep.STRENGTHEN: bg_cpuset = set(self._bg_cpuset) @@ -72,14 +59,13 @@ def weaken(self) -> 'CoreIsolator': """ Weaken increase the number of CPUs assigned to BG workloads and decrease that of FG workload TODO: Changing step size, if needed - :return: """ # NOTE: Caller is assumed that BG workload logger = logging.getLogger(__name__) - logger.info(f'self._cur_bg_step: {self._cur_bg_step}') - logger.info(f'self._cur_fg_step: {self._cur_fg_step}') - logger.info(f'self._bg_next_step: {self._bg_next_step.name}') - logger.info(f'self._fg_next_step: {self._fg_next_step.name}') + logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') + logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') + logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') + logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') if self._bg_next_step == NextStep.WEAKEN: bg_cpuset = 
set(self._bg_cpuset) @@ -96,16 +82,18 @@ def weaken(self) -> 'CoreIsolator': @property def is_max_level(self) -> bool: logger = logging.getLogger(__name__) - logger.info(f'bg max cpuset: {max(self._cpu_topo[self._background_wl.socket_id])}') - logger.info(f'self._cur_bg_step: {self._cur_bg_step}') - logger.info(f'self._cur_fg_step: {self._cur_fg_step}') - logger.info(f'self._bg_next_step: {self._bg_next_step.name}') - logger.info(f'self._fg_next_step: {self._fg_next_step.name}') + logger.debug(f'bg max cpuset: {max(numa_topology.node_to_core[self._background_wl.cur_socket_id()])}') + logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') + logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') + logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') + logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') + + # FIXME: hard coded (Background can take lower cores) # FIXME: How about first condition is true but the other is false? - if self._cur_bg_step == max(self._cpu_topo[self._background_wl.socket_id]): + if self._cur_bg_step == max(numa_topology.node_to_core[self._background_wl.cur_socket_id()]): self._bg_next_step = NextStep.STOP return True - #if self._cur_fg_step == self._cur_bg_step-1: + # if self._cur_fg_step == self._cur_bg_step-1: # self._fg_next_step = NextStep.STOP else: return False @@ -113,25 +101,25 @@ def is_max_level(self) -> bool: @property def is_min_level(self) -> bool: logger = logging.getLogger(__name__) - logger.info(f'self._cur_bg_step: {self._cur_bg_step}') - logger.info(f'self._cur_fg_step: {self._cur_fg_step}') - logger.info(f'self._bg_next_step: {self._bg_next_step.name}') - logger.info(f'self._fg_next_step: {self._fg_next_step.name}') + logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') + logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') + logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') + logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') # FIXME: How about first condition is true 
but the other is false? - if self._cur_bg_step == self._cur_fg_step+1: + if self._cur_bg_step == self._cur_fg_step + 1: return True - #if self._cur_fg_step == min(self._cpu_topo[self._foreground_wl.socket_id]): + # if self._cur_fg_step == min(self._cpu_topo[self._foreground_wl.socket_id]): # return True else: return False def _enforce(self) -> None: logger = logging.getLogger(__name__) - logger.info(f'after enforcing : self._cur_bg_step is {self._cur_bg_step}') - logger.info(f'after enforcing : self._cur_fg_step is {self._cur_fg_step}') - logger.info(f'after enforcing : affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') - logger.info(f'after enforcing : affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') + logger.debug(f'after enforcing : self._cur_bg_step is {self._cur_bg_step}') + logger.debug(f'after enforcing : self._cur_fg_step is {self._cur_fg_step}') + logger.debug(f'after enforcing : affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') + logger.debug(f'after enforcing : affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') self._bg_cgroup.assign_cpus(set(self._bg_cpuset)) self._fg_cgroup.assign_cpus(set(self._fg_cpuset)) @@ -203,7 +191,7 @@ def _monitoring_result(self) -> NextStep: # Case1 : diff is too small to perform isolation if abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: + or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: self._bg_next_step = NextStep.STOP # self._fg_next_step = NextStep.STOP # This line depends on bg status return NextStep.STOP @@ -232,15 +220,15 @@ def _monitoring_result(self) -> NextStep: def bg_outside_boundary(self) -> bool: # FIXME: Assumption about fg's cpuset IDs are smaller than bg's ones. 
(kind of hard coded) - max_bg_cpuid = max(self._cpu_topo[self._background_wl.socket_id]) - min_bg_cpuid = max(self._fg_cpuset)+1 + max_bg_cpuid = max(numa_topology.node_to_core[self._background_wl.cur_socket_id()]) + min_bg_cpuid = max(self._fg_cpuset) + 1 if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): return True else: return False def fg_strengthen_cond(self, fg_ipc_diff) -> bool: - min_skt_cpuid = min(self._cpu_topo[self._foreground_wl.socket_id]) + min_skt_cpuid = min(numa_topology.node_to_core[self._foreground_wl.cur_socket_id()]) if fg_ipc_diff > 0 and self._cur_fg_step > min_skt_cpuid: return True else: @@ -254,3 +242,9 @@ def fg_weaken_cond(self, fg_ipc_diff) -> bool: return True else: return False + + def reset(self) -> None: + if self._background_wl.is_running: + self._bg_cgroup.assign_cpus(self._prev_bg_affinity) + if self._foreground_wl.is_running: + self._fg_cgroup.assign_cpus(self._prev_fg_affinity) diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index 9a2ec15..f886f15 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -37,3 +37,6 @@ def _monitoring_result(self) -> NextStep: self._fg_next_step = NextStep.IDLE self._bg_next_step = NextStep.IDLE return NextStep.IDLE + + def reset(self) -> None: + pass diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 107cacb..1bdb6d3 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -23,9 +23,6 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resour # FIXME: hard coded self._cur_step = DVFS.MAX - def __del__(self) -> None: - DVFS.set_freq(DVFS.MAX, chain(self._bg_affinity)) - def strengthen(self) -> 'MemoryIsolator': self._cur_step -= DVFS.STEP return self @@ -97,3 +94,6 @@ def _monitoring_result(self) -> 
NextStep: return NextStep.STOP else: return NextStep.STRENGTHEN + + def reset(self) -> None: + DVFS.set_freq(DVFS.MAX, chain(self._bg_affinity)) diff --git a/isolating_controller/isolation/isolators/swap.py b/isolating_controller/isolation/isolators/swap.py index 5857dcf..b7fd82c 100644 --- a/isolating_controller/isolation/isolators/swap.py +++ b/isolating_controller/isolation/isolators/swap.py @@ -1,13 +1,13 @@ # coding: UTF-8 import logging - from typing import Dict, Set from .base_isolator import Isolator from .. import NextStep -from ...workload import Workload from ..policies import IsolationPolicy +from ...workload import Workload + class SwapIsolator(Isolator): @@ -15,7 +15,7 @@ class SwapIsolator(Isolator): def __init__(self, foreground_wl: Workload, background_wl: Workload, isolation_groups: Dict[IsolationPolicy, int]) -> None: - super().__init__(foreground_wl, background_wl, None) + super().__init__(foreground_wl, background_wl) self._all_groups = isolation_groups self._swap_candidates: Set[Workload] = None @@ -30,7 +30,6 @@ def __del__(self): if self._background_wl.is_running: logger.debug(f'reset swap configuration of {self._background_wl}') - def strengthen(self) -> 'SwapIsolator': """ Choosing which contentious workloads to swap out to other socket @@ -50,8 +49,7 @@ def is_max_level(self) -> bool: :return: """ # FIXME: hard coded - return self._swap_candidates == None - + return self._swap_candidates is None @property def is_min_level(self) -> bool: @@ -61,8 +59,7 @@ def is_min_level(self) -> bool: :return: """ # FIXME: hard coded - return self._swap_candidates == None - + return self._swap_candidates is None def weaken(self) -> 'SwapIsolator': """ @@ -80,10 +77,10 @@ def _enforce(self) -> None: """ pass -# def enforce(self) -> None: -# self._prev_metric_diff: MetricDiff = self._foreground_wl.calc_metric_diff() -# -# self._enforce() + # def enforce(self) -> None: + # self._prev_metric_diff: MetricDiff = self._foreground_wl.calc_metric_diff() + # + # 
self._enforce() def _first_decision(self) -> NextStep: """ @@ -98,3 +95,6 @@ def _monitoring_result(self) -> NextStep: :return: """ pass + + def reset(self) -> None: + pass diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index b5c2e39..858dba2 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -14,10 +14,9 @@ class IsolationPolicy(metaclass=ABCMeta): # FIXME : _CPU_THRESHOLD needs test _CPU_THRESHOLD = 0.1 - def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._fg_wl = fg_wl self._bg_wl = bg_wl - self._skt_id = skt_id self._isolator_map: Mapping[Type[Isolator], Isolator] = dict() self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index cef9e77..3a615b1 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -4,13 +4,13 @@ from .. 
import ResourceType from .base_policy import IsolationPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload class DiffPolicy(IsolationPolicy): - def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: - super().__init__(fg_wl, bg_wl, skt_id) + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) self._is_llc_isolated = False self._is_mem_isolated = False @@ -31,6 +31,10 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() + if self._is_sched_isolated and self._is_mem_isolated and self._is_llc_isolated: + self._clear_flags() + logger.debug('****All isolators are applicable for now!****') + if not self._is_llc_isolated and resource is ResourceType.CACHE: self._cur_isolator = self._isolator_map[CacheIsolator] self._is_llc_isolated = True diff --git a/isolating_controller/isolation/policies/diff_policy_cpu.py b/isolating_controller/isolation/policies/diff_policy_cpu.py index 99be013..e61893e 100644 --- a/isolating_controller/isolation/policies/diff_policy_cpu.py +++ b/isolating_controller/isolation/policies/diff_policy_cpu.py @@ -4,13 +4,13 @@ from .. 
import ResourceType from .base_policy import IsolationPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload class DiffCPUPolicy(IsolationPolicy): - def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: - super().__init__(fg_wl, bg_wl, skt_id) + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) self._is_llc_isolated = False self._is_mem_isolated = False diff --git a/isolating_controller/isolation/policies/diff_with_violation_policy.py b/isolating_controller/isolation/policies/diff_with_violation_policy.py index c50b98d..5649c4c 100644 --- a/isolating_controller/isolation/policies/diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/diff_with_violation_policy.py @@ -4,15 +4,15 @@ from .. import ResourceType from .diff_policy import DiffPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload class DiffWViolationPolicy(DiffPolicy): VIOLATION_THRESHOLD = 3 - def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: - super().__init__(fg_wl, bg_wl, skt_id) + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) self._violation_count: int = 0 diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index da37000..7f557d2 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -4,13 +4,13 @@ from .. 
import ResourceType from .base_policy import IsolationPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload class GreedyDiffPolicy(IsolationPolicy): - def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: - super().__init__(fg_wl, bg_wl, skt_id) + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) self._is_mem_isolated = False diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index ce7f3f2..8e438f0 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,15 +4,15 @@ from .. import ResourceType from .greedy_diff_policy import GreedyDiffPolicy -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, CoreIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload class GreedyDiffWViolationPolicy(GreedyDiffPolicy): VIOLATION_THRESHOLD = 3 - def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: - super().__init__(fg_wl, bg_wl, skt_id) + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) self._violation_count: int = 0 diff --git a/isolating_controller/utils/__init__.py b/isolating_controller/utils/__init__.py index 9902838..9526532 100644 --- a/isolating_controller/utils/__init__.py +++ b/isolating_controller/utils/__init__.py @@ -1,7 +1,5 @@ # coding: UTF-8 from .cat import CAT -from .resctrl import ResCtrl -from .cgroup import Cgroup from .dvfs import DVFS -from .numa_topology import NumaTopology +from .resctrl import ResCtrl diff --git a/isolating_controller/utils/cgroup.py 
b/isolating_controller/utils/cgroup.py index 2f690fd..c5bc82d 100644 --- a/isolating_controller/utils/cgroup.py +++ b/isolating_controller/utils/cgroup.py @@ -1,15 +1,16 @@ # coding: UTF-8 -import subprocess import getpass import grp import os +import subprocess +from typing import Iterable, Optional, Set -from typing import Iterable, Set, Optional from .hyphen import convert_to_set +# TODO: delete class Cgroup: CPUSET_MOUNT_POINT = '/sys/fs/cgroup/cpuset' CPU_MOUNT_POINT = '/sys/fs/cgroup/cpu' @@ -25,10 +26,10 @@ def create_group(self) -> None: gname: str = grp.getgrgid(gid).gr_name subprocess.check_call(args=( - 'sudo', 'cgcreate', '-a', f'{uname}:{gname}', '-d', '700', '-f', - '600', '-t', f'{uname}:{gname}', '-s', '600', '-g', self._group_path)) + 'sudo', 'cgcreate', '-a', f'{uname}:{gname}', '-d', '700', '-f', + '600', '-t', f'{uname}:{gname}', '-s', '600', '-g', self._group_path)) - def assign_cpus(self, core_set: Set[int]) -> None: + def assign_cpus(self, core_set: Iterable[int]) -> None: core_ids = ','.join(map(str, core_set)) subprocess.check_call(args=('cgset', '-r', f'cpuset.cpus={core_ids}', self._group_name)) @@ -42,14 +43,14 @@ def _get_cpu_affinity_from_group(self) -> Set[int]: core_set: Set[int] = convert_to_set(line) return core_set - def limit_cpu_quota(self, limit_percentage: float, period: Optional[int]=None) -> None: + def limit_cpu_quota(self, limit_percentage: float, period: Optional[int] = None) -> None: if period is None: with open(f'{Cgroup.CPU_MOUNT_POINT}/cpu.cfs_period_us', "r") as fp: line: str = fp.readline() period = int(line) cpu_cores = self._get_cpu_affinity_from_group() - quota = int(period * limit_percentage/100 * len(cpu_cores)) + quota = int(period * limit_percentage / 100 * len(cpu_cores)) subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_quota_us={quota}', self._group_name)) subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_period_us={period}', self._group_name)) diff --git 
a/isolating_controller/utils/cgroup/__init__.py b/isolating_controller/utils/cgroup/__init__.py new file mode 100644 index 0000000..8e90c79 --- /dev/null +++ b/isolating_controller/utils/cgroup/__init__.py @@ -0,0 +1,5 @@ +# coding: UTF-8 + +from .base import BaseCgroup +from .cpu import Cpu +from .cpuset import CpuSet diff --git a/isolating_controller/utils/cgroup/base.py b/isolating_controller/utils/cgroup/base.py new file mode 100644 index 0000000..9c27abf --- /dev/null +++ b/isolating_controller/utils/cgroup/base.py @@ -0,0 +1,32 @@ +# coding: UTF-8 + +import getpass +import grp +import os +import subprocess +from abc import ABCMeta +from typing import Iterable + + +class BaseCgroup(metaclass=ABCMeta): + MOUNT_POINT = '/sys/fs/cgroup' + CONTROLLER = str() + + def __init__(self, group_name: str) -> None: + self._group_name: str = group_name + self._group_path: str = f'{self.CONTROLLER}:{group_name}' + + def create_group(self) -> None: + uname: str = getpass.getuser() + gid: int = os.getegid() + gname: str = grp.getgrgid(gid).gr_name + + subprocess.check_call(args=( + 'sudo', 'cgcreate', '-a', f'{uname}:{gname}', '-d', '755', '-f', + '644', '-t', f'{uname}:{gname}', '-s', '644', '-g', self._group_path)) + + def add_tasks(self, pids: Iterable[int]) -> None: + subprocess.check_call(args=('cgclassify', '-g', self._group_path, '--sticky', *map(str, pids))) + + def delete(self) -> None: + subprocess.check_call(args=('sudo', 'cgdelete', '-r', '-g', self._group_path)) diff --git a/isolating_controller/utils/cgroup/cpu.py b/isolating_controller/utils/cgroup/cpu.py new file mode 100644 index 0000000..889cdff --- /dev/null +++ b/isolating_controller/utils/cgroup/cpu.py @@ -0,0 +1,14 @@ +# coding: UTF-8 + + +import subprocess + +from .base import BaseCgroup + + +class Cpu(BaseCgroup): + CONTROLLER = 'cpu' + + def limit_cpu_quota(self, quota: int, period: int) -> None: + subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_quota_us={quota}', self._group_name)) + 
subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_period_us={period}', self._group_name)) diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py new file mode 100644 index 0000000..adf11f5 --- /dev/null +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -0,0 +1,19 @@ +# coding: UTF-8 + + +import subprocess +from typing import Iterable, Set + +from .base import BaseCgroup + + +class CpuSet(BaseCgroup): + CONTROLLER = 'cpuset' + + def assign_cpus(self, core_set: Iterable[int]) -> None: + core_ids = ','.join(map(str, core_set)) + subprocess.check_call(args=('cgset', '-r', f'cpuset.cpus={core_ids}', self._group_name)) + + def assign_mems(self, socket_set: Set[int]) -> None: + mem_ids = ','.join(map(str, socket_set)) + subprocess.check_call(args=('cgset', '-r', f'cpuset.mems={mem_ids}', self._group_name)) diff --git a/isolating_controller/utils/numa_topology.py b/isolating_controller/utils/numa_topology.py index cf78890..a3e3f2f 100644 --- a/isolating_controller/utils/numa_topology.py +++ b/isolating_controller/utils/numa_topology.py @@ -1,52 +1,58 @@ # coding: UTF-8 from pathlib import Path -from typing import Dict, Set, Tuple +from typing import Dict, Set from .hyphen import convert_to_set +_BASE_PATH: Path = Path('/sys/devices/system/node') -class NumaTopology: - BASE_PATH: Path = Path('/sys/devices/system/node') - @staticmethod - def get_node_topo() -> Set[int]: - online_path: Path = NumaTopology.BASE_PATH / 'online' +def get_mem_topo() -> Set[int]: + has_memory_path = _BASE_PATH / 'has_memory' - with open(online_path, "r") as fp: - line: str = fp.readline() - node_list = convert_to_set(line) + with has_memory_path.open() as fp: + line: str = fp.readline() + mem_topo = convert_to_set(line) - return node_list + # TODO: get_mem_topo can be enhanced by using real numa memory access latency - @staticmethod - def get_cpu_topo(node_list: Set[int]) -> Dict[int, Set[int]]: - cpu_topo: Dict[int, Set[int]] = dict() + return 
mem_topo - for num in node_list: - cpulist_path: Path = NumaTopology.BASE_PATH / f'node{num}/cpulist' - with open(cpulist_path, "r") as fp: - line: str = fp.readline() - cpu_topo[num] = convert_to_set(line) +def cur_online_nodes() -> Set[int]: + online_path: Path = _BASE_PATH / 'online' - return cpu_topo + with online_path.open() as fp: + line: str = fp.readline() + node_list = convert_to_set(line) - @staticmethod - def get_mem_topo() -> Set[int]: - has_memory_path = NumaTopology.BASE_PATH / 'has_memory' + return node_list - with open(has_memory_path, "r") as fp: - line: str = fp.readline() - mem_topo = convert_to_set(line) - # TODO: get_mem_topo can be enhanced by using real numa memory access latency +def core_belongs_to(socket_id: int) -> Set[int]: + cpulist_path: Path = _BASE_PATH / f'node{socket_id}/cpulist' - return mem_topo + with cpulist_path.open() as fp: + line: str = fp.readline() + return convert_to_set(line) - @staticmethod - def get_numa_info() -> Tuple[Dict[int, Set[int]], Set[int]]: - node_list = NumaTopology.get_node_topo() - cpu_topo = NumaTopology.get_cpu_topo(node_list) - mem_topo = NumaTopology.get_mem_topo() - return cpu_topo, mem_topo + +def _node_to_core() -> Dict[int, Set[int]]: + node_list = cur_online_nodes() + return dict((socket_id, core_belongs_to(socket_id)) for socket_id in node_list) + + +def _core_to_node() -> Dict[int, int]: + ret_dict: Dict[int, int] = dict() + node_list = cur_online_nodes() + + for socket_id in node_list: + for core_id in core_belongs_to(socket_id): + ret_dict[core_id] = socket_id + + return ret_dict + + +node_to_core: Dict[int, Set[int]] = _node_to_core() # key: socket id, value: corresponding core ids +core_to_node: Dict[int, int] = _core_to_node() # key: core id, value: corresponding socket id diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index d7f13ed..6ef1a26 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -1,12
+1,7 @@ # coding: UTF-8 import subprocess -import asyncio from pathlib import Path -from typing import Dict, Iterable, List, Tuple - -#import aiofiles -#from aiofiles.base import AiofilesContextManager def len_of_mask(mask: str) -> int: @@ -32,7 +27,7 @@ class ResCtrl: def __init__(self, group_name: str) -> None: self._group_name: str = group_name - self._group_path: Path = ResCtrl.MOUNT_POINT/f'{group_name}' + self._group_path: Path = ResCtrl.MOUNT_POINT / f'{group_name}' @property def group_name(self): @@ -45,7 +40,7 @@ def group_name(self, new_name): def add_task(self, pid: int) -> None: subprocess.run(args=('sudo', 'tee', str(self._group_path / 'tasks')), - input=f'{pid}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) + input=f'{pid}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) def assign_llc(self, *masks: str) -> None: masks = (f'{i}={mask}' for i, mask in enumerate(masks)) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 805f2a1..25ddab8 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -2,15 +2,14 @@ from collections import deque from itertools import chain -from typing import Deque, Tuple, Set +from typing import Deque, Tuple import cpuinfo import psutil -from .utils.numa_topology import NumaTopology from .metric_container.basic_metric import BasicMetric, MetricDiff from .solorun_data.datas import data_map - +from .utils import numa_topology L3_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 @@ -31,11 +30,13 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._perf_interval = perf_interval self._proc_info = psutil.Process(pid) - self._socket_id = None def __repr__(self) -> str: return f'{self._name} (pid: {self._pid})' + def __hash__(self) -> int: + return self._pid + @property def name(self) -> str: return self._name @@ -52,11 +53,6 @@ def wl_type(self) -> str: def metrics(self) -> 
Deque[BasicMetric]: return self._metrics - @property - def socket_id(self) -> int: - self._socket_id = self.get_socket_id() - return self._socket_id - @property def cpuset(self) -> Tuple[int, ...]: return tuple(self._proc_info.cpu_affinity()) @@ -87,16 +83,12 @@ def all_child_tid(self) -> Tuple[int, ...]: )) except psutil.NoSuchProcess: return tuple() - - def get_socket_id(self) -> int: - cpuset: Set[int] = self.cpuset - cpu_topo, _ = NumaTopology.get_numa_info() - - # FIXME: Hardcode for assumption (one workload to one socket) - for socket_id, skt_cpus in cpu_topo.items(): - #print(f'cpuset: {cpuset}, socket_id: {socket_id}, skt_cpus: {skt_cpus}') - for cpu_id in cpuset: - if cpu_id in skt_cpus: - ret = socket_id - self._socket_id = ret - return ret + + def cur_socket_id(self) -> int: + sockets = frozenset(numa_topology.core_to_node[core_id] for core_id in self.cpuset) + + # FIXME: hard coded + if len(sockets) is not 1: + raise NotImplementedError('Workload spans multiple sockets.') + else: + return next(iter(sockets)) diff --git a/pending_queue.py b/pending_queue.py index 6c2fad0..d8338a5 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -1,21 +1,19 @@ # coding: UTF-8 import logging - from typing import Dict, List, Sized, Type from isolating_controller.isolation.policies import IsolationPolicy from isolating_controller.workload import Workload -from isolating_controller.utils.numa_topology import NumaTopology + class PendingQueue(Sized): def __init__(self, policy_type: Type[IsolationPolicy], max_pending: int) -> None: self._policy_type: Type[IsolationPolicy] = policy_type self._max_pending: int = max_pending - self._cur_pending: int = 0 - self._bg_q: Dict[int, Workload] = dict() - self._fg_q: Dict[int, Workload] = dict() + self._cur_ready: int = 0 + self._ready_q: Dict[int, List[Workload]] = dict() # key: socket id, value: workloads self._pending_list: List[IsolationPolicy] = list() def __len__(self) -> int: @@ -23,80 +21,35 @@ def __len__(self) -> int: 
filter(lambda x: len(x.foreground_workload.metrics) > 0 and len(x.background_workload.metrics) > 0, self._pending_list))) - def add_bg(self, workload: Workload) -> None: - logger = logging.getLogger(__name__) - logger.info(f'{workload} is ready for active as Background') - logger.info(f'self._cur_pending: {self._cur_pending}') - - self._bg_q[workload.pid] = workload - self._cur_pending += 1 - if self._cur_pending == self._max_pending: - self.dump_to_pending_list() - - - def add_fg(self, workload: Workload) -> None: + def add(self, workload: Workload) -> None: logger = logging.getLogger(__name__) - logger.info(f'{workload} is ready for active as Foreground') - logger.info(f'self._cur_pending: {self._cur_pending}') + logger.debug(f'self._cur_ready: {self._cur_ready}') - self._fg_q[workload.pid] = workload - self._cur_pending += 1 - if self._cur_pending == self._max_pending: - self.dump_to_pending_list() + self._ready_q[workload.cur_socket_id()].append(workload) + self._cur_ready += 1 + if self._cur_ready == self._max_pending: + self._dump_to_pending_list() def pop(self) -> IsolationPolicy: if len(self) is 0: raise IndexError(f'{self} is empty') return self._pending_list.pop() - def dump_to_pending_list(self) -> None: + def _dump_to_pending_list(self) -> None: logger = logging.getLogger(__name__) - logger.info('Dumping workloads to pending list!') - - fg_pids = list(self._fg_q.keys()) - bg_pids = list(self._bg_q.keys()) - all_pids = list() - for i in range(len(self._fg_q)): - all_pids.append(fg_pids[i]) - for i in range(len(self._bg_q)): - all_pids.append(bg_pids[i]) - - node_list = NumaTopology.get_node_topo() - group_pids = dict() # Dict. 
for grouping the fg and bg - for node in node_list: - group_pids[node] = set() - - for pid in all_pids: - if pid in fg_pids: - skt_id = self._fg_q[pid].get_socket_id() - group_pids[skt_id].add(pid) - elif pid in bg_pids: - skt_id = self._bg_q[pid].get_socket_id() - group_pids[skt_id].add(pid) - - logger.info('Trying to create new groups!') - # - # Grouping pids based on their types and skt_id - for node in node_list: - node_pidset = group_pids[node] - if len(node_pidset) > 0: - pid = node_pidset.pop() - print(f'Pop {pid}!') - if pid in fg_pids: - bg_pid = node_pidset.pop() - print(f'Pop {bg_pid}!') - new_group = self._policy_type(self._fg_q[pid], self._bg_q[bg_pid], node) - self._pending_list.append(new_group) - del self._fg_q[pid] - del self._bg_q[bg_pid] - elif pid in bg_pids: - fg_pid = node_pidset.pop() - print(f'Pop {fg_pid}!') - new_group = self._policy_type(self._fg_q[fg_pid], self._bg_q[pid], node) - self._pending_list.append(new_group) - del self._fg_q[fg_pid] - del self._bg_q[pid] - return + logger.debug('Dumping workloads to pending list!') + + for socket_id, workloads in self._ready_q.items(): + # FIXME: hard coded + if len(workloads) is 2 and workloads[0].wl_type != workloads[1].wl_type: + if workloads[0].wl_type == 'fg': + fg = workloads[0] + bg = workloads[1] + else: + fg = workloads[1] + bg = workloads[0] + new_group = self._policy_type(fg, bg) + self._pending_list.append(new_group) def update_max_pending(self, new_max_pending: int): self._max_pending = new_max_pending From 974f940aeba2c18e6464f24f49ecabc3c1ecda44 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 25 Sep 2018 17:54:49 +0900 Subject: [PATCH 16/82] feat: Add swap_iso.py and related codes --- controller.py | 1 - .../isolation/policies/base_policy.py | 41 ++++++++++ isolating_controller/workload.py | 7 +- swap_iso.py | 81 +++++++++++++++++++ 4 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 swap_iso.py diff --git a/controller.py b/controller.py index 
1497d7d..201c1a5 100755 --- a/controller.py +++ b/controller.py @@ -48,7 +48,6 @@ def __init__(self, metric_buf_size: int) -> None: ## FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) self._pending_wl = PendingQueue(DiffCPUPolicy, 2) self._control_thread = ControlThread(self._pending_wl) - self._lock = RLock() def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: ch.basic_ack(method.delivery_tag) diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index b5c2e39..6c56413 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -22,6 +22,8 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload, skt_id: int) -> None: self._isolator_map: Mapping[Type[Isolator], Isolator] = dict() self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR + self._aggr_ipc_diff: float = None + def __hash__(self) -> int: return self._fg_wl.pid @@ -94,6 +96,45 @@ def cur_isolator(self) -> Isolator: def name(self) -> str: return f'{self._fg_wl.name}({self._fg_wl.pid})' + @property + def aggr_ipc(self) -> float: + return self._aggr_ipc_diff + + @property + def most_cont_workload(self) -> Workload: + fg_wl = self.foreground_workload + bg_wl = self.background_workload + + fg_ipc_diff = fg_wl.ipc_diff + bg_ipc_diff = bg_wl.ipc_diff + + # FIXME: Below condition is likely to fail due to too little differences between fg and bg + if fg_ipc_diff < bg_ipc_diff: + return fg_wl + else: + return bg_wl + + @property + def least_cont_workload(self) -> Workload: + fg_wl = self.foreground_workload + bg_wl = self.background_workload + + fg_ipc_diff = fg_wl.ipc_diff + bg_ipc_diff = bg_wl.ipc_diff + + # FIXME: Below condition is likely to fail due to too little differences between fg and bg + if fg_ipc_diff > bg_ipc_diff: + return fg_wl + else: + return bg_wl + + def 
update_aggr_ipc(self) -> None: + fg_diff = self._fg_wl.calc_metric_diff() + bg_diff = self._bg_wl.calc_metric_diff() + self._fg_wl._ipc_diff = fg_diff.ipc + self._bg_wl._ipc_diff = bg_diff.ipc + self._aggr_ipc_diff = fg_diff.ipc + bg_diff.ipc + def set_idle_isolator(self) -> None: self._cur_isolator.yield_isolation() self._cur_isolator = IsolationPolicy._IDLE_ISOLATOR diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 805f2a1..5bf658f 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -31,7 +31,8 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._perf_interval = perf_interval self._proc_info = psutil.Process(pid) - self._socket_id = None + self._socket_id: int = None + self._ipc_diff: float = None def __repr__(self) -> str: return f'{self._name} (pid: {self._pid})' @@ -73,6 +74,10 @@ def perf_interval(self): def is_running(self) -> bool: return self._proc_info.is_running() + @property + def ipc_diff(self) -> float: + return self._ipc_diff + def calc_metric_diff(self) -> MetricDiff: solorun_data = data_map[self.name] curr_metric: BasicMetric = self._metrics[0] diff --git a/swap_iso.py b/swap_iso.py new file mode 100644 index 0000000..49f2138 --- /dev/null +++ b/swap_iso.py @@ -0,0 +1,81 @@ +# coding: UTF-8 + +import logging + +from enum import IntEnum +from typing import Dict, Set, Optional + +from isolating_controller.workload import Workload +from isolating_controller.isolation.policies.base_policy import IsolationPolicy + + +class SwapNextStep(IntEnum): + OUT = 0 + IN = 1 + + +class SwapIsolator: + # FIXME: This threshold needs tests (How big diff is right for swapping workloads?) + _DIFF_THRESHOLD = 0.001 + + def __init__(self, isolation_groups: Dict[int, IsolationPolicy]) -> None: + """ + + :param isolation_groups: Dict. 
Key is the number of group and Value is the group itself + """ + self._all_groups = isolation_groups + self._swap_candidates: Dict[SwapNextStep, Workload] = dict() + + self._most_cont_group: Optional[IsolationPolicy] = None + self._least_cont_group: Optional[IsolationPolicy] = None + + self._most_cont_workload: Optional[Workload] = None + self._least_cont_workload: Optional[Workload] = None + + self.ipc_diffs: Dict[float, int] = dict() # key:val = aggr_ipc_diff:grp_idx + + def __del__(self): + logger = logging.getLogger(__name__) + + def update_cont_group(self) -> None: + """ + Most contentious group is the group which shows "the LOWEST aggr. ipc diff" + Least contentious group is the group which shows "the HIGHEST aggr. ipc diff" + + Assumption : Swap Isolator swaps workloads between the most cont. group and the least cont. group + """ + all_ipc_diffs = list() + + # Update Aggr. IPC Diffs of All Groups + for grp_idx, group in self._all_groups.items(): + group.update_aggr_ipc() + aggr_ipc_diff = group.aggr_ipc + all_ipc_diffs.append(aggr_ipc_diff) + self.ipc_diffs[aggr_ipc_diff] = grp_idx + + max_aggr_ipc_diff = max(all_ipc_diffs) + min_aggr_ipc_diff = min(all_ipc_diffs) + + swap_out_grp = self.ipc_diffs[max_aggr_ipc_diff] + swap_in_grp = self.ipc_diffs[min_aggr_ipc_diff] + + self._most_cont_group = swap_out_grp + self._least_cont_group = swap_in_grp + + def choose_swap_candidates(self): + swap_out_grp = self._most_cont_group + swap_in_grp = self._least_cont_group + + # FIXME: This part depends on the swap policy (Which one is selected for swapping) + swap_out_wl = swap_out_grp.most_cont_workload + swap_in_wl = swap_in_grp.most_cont_workload # It selects the bg workload in swap_in group + + self._swap_candidates[SwapNextStep.OUT] = swap_out_wl + self._swap_candidates[SwapNextStep.IN] = swap_in_wl + + def first_decision(self): + return + + def enforce(self): + return + From 70e50e5580b9f1577a378443085223694d40bca1 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: 
Thu, 27 Sep 2018 05:17:18 +0900 Subject: [PATCH 17/82] feat: Add SwapIsolator and related code --- controller.py | 15 +++- .../isolation/policies/base_policy.py | 23 +++++ .../metric_container/basic_metric.py | 4 +- isolating_controller/utils/cgroup.py | 6 ++ swap_iso.py | 90 ++++++++++++++++--- 5 files changed, 121 insertions(+), 17 deletions(-) diff --git a/controller.py b/controller.py index 201c1a5..e7fc6d4 100755 --- a/controller.py +++ b/controller.py @@ -24,7 +24,7 @@ from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue -from threading import RLock +from swap_iso import SwapIsolator MIN_PYTHON = (3, 6) @@ -140,11 +140,14 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._interval: float = 0.2 # Scheduling interval self._isolation_groups: Dict[IsolationPolicy, int] = dict() + self._all_groups: Dict[int, IsolationPolicy] = dict() + self._swapper: SwapIsolator = None def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - ## TODO: Swapper may come here + # TODO: Swapper may come here + self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): logger.info('') @@ -193,6 +196,12 @@ def _register_pending_workloads(self) -> None: self._isolation_groups[pending_group] = 0 pending_group.init_isolators() + # init self._all_groups if pending_group exist + if len(self._isolation_groups) > 0: + all_groups = list(self._isolation_groups.keys()) + for idx, group in enumerate(all_groups): + self._all_groups[idx] = group + def _remove_ended_groups(self) -> None: """ deletes the finished workloads(threads) from the dict. 
@@ -215,6 +224,8 @@ def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') + # Swapper init + self._swapper = SwapIsolator(self._all_groups) while True: self._remove_ended_groups() self._register_pending_workloads() diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 6c56413..b215ae1 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -128,6 +128,19 @@ def least_cont_workload(self) -> Workload: else: return bg_wl + @property + def least_mem_bw_workload(self) -> Workload: + fg_wl = self.foreground_workload + bg_wl = self.background_workload + + fg_mem_bw = fg_wl.metrics[0].local_mem_ps() + bg_mem_bw = bg_wl.metrics[0].local_mem_ps() + + if fg_mem_bw > bg_mem_bw: + return bg_wl + else: + return fg_wl + def update_aggr_ipc(self) -> None: fg_diff = self._fg_wl.calc_metric_diff() bg_diff = self._bg_wl.calc_metric_diff() @@ -135,6 +148,16 @@ def update_aggr_ipc(self) -> None: self._bg_wl._ipc_diff = bg_diff.ipc self._aggr_ipc_diff = fg_diff.ipc + bg_diff.ipc + def contention_diff(self, rtype: ResourceType) -> float: + fg_diff = self._fg_wl.calc_metric_diff() + bg_diff = self._bg_wl.calc_metric_diff() + if rtype is ResourceType.CPU: + return fg_diff.ipc + bg_diff.ipc + elif rtype is ResourceType.CACHE: + return fg_diff.l3_hit_ratio + bg_diff.l3_hit_ratio + elif rtype is ResourceType.MEMORY: + return fg_diff.local_mem_util_ps + bg_diff.local_mem_util_ps + def set_idle_isolator(self) -> None: self._cur_isolator.yield_isolation() self._cur_isolator = IsolationPolicy._IDLE_ISOLATOR diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 8dc8fd0..363c680 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -73,7 +73,7 @@ def 
req_date(self): return self._req_date @property - def ipc(self) -> float: + def ipc(self): return self._instructions / self._cycles @property @@ -123,7 +123,7 @@ def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio self._local_mem_ps = curr.local_mem_ps() / prev.local_mem_ps() - 1 self._remote_mem_ps = curr.remote_mem_ps() / prev.remote_mem_ps() - 1 - self._ipc = curr.ipc - prev.ipc + self._ipc = curr.ipc() / prev.ipc() - 1 @property def l3_hit_ratio(self): diff --git a/isolating_controller/utils/cgroup.py b/isolating_controller/utils/cgroup.py index 2f690fd..aadbf7b 100644 --- a/isolating_controller/utils/cgroup.py +++ b/isolating_controller/utils/cgroup.py @@ -59,3 +59,9 @@ def add_tasks(self, pids: Iterable[int]) -> None: def delete(self) -> None: subprocess.check_call(args=('sudo', 'cgdelete', '-r', '-g', self._group_path)) + + def enable_memory_migrate(self) -> None: + subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate=1', self._group_name)) + + def disable_memory_migrate(self) -> None: + subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate=0', self._group_name)) \ No newline at end of file diff --git a/swap_iso.py b/swap_iso.py index 49f2138..f22e249 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -1,12 +1,15 @@ # coding: UTF-8 +import os +import signal import logging from enum import IntEnum -from typing import Dict, Set, Optional +from typing import Dict, Optional, Tuple from isolating_controller.workload import Workload from isolating_controller.isolation.policies.base_policy import IsolationPolicy +from isolating_controller.utils.cgroup import Cgroup class SwapNextStep(IntEnum): @@ -15,15 +18,16 @@ class SwapNextStep(IntEnum): class SwapIsolator: - # FIXME: This threshold needs tests (How big diff is right for swapping workloads?) - _DIFF_THRESHOLD = 0.001 + # FIXME: This threshold needs tests (How small diff is right for swapping workloads?) 
+ # "-0.5" means the IPCs of workloads in a group drop 50% compared to solo-run + _IPC_DIFF_THRESHOLD = -0.5 def __init__(self, isolation_groups: Dict[int, IsolationPolicy]) -> None: """ - :param isolation_groups: Dict. Key is the number of group and Value is the group itself + :param isolation_groups: Dict. Key is the index of group and Value is the group itself """ - self._all_groups = isolation_groups + self._all_groups: Dict[int, IsolationPolicy] = isolation_groups self._swap_candidates: Dict[SwapNextStep, Workload] = dict() self._most_cont_group: Optional[IsolationPolicy] = None @@ -32,10 +36,11 @@ def __init__(self, isolation_groups: Dict[int, IsolationPolicy]) -> None: self._most_cont_workload: Optional[Workload] = None self._least_cont_workload: Optional[Workload] = None - self.ipc_diffs: Dict[float, int] = dict() # key:val = aggr_ipc_diff:grp_idx + self.aggr_ipc_diffs: Dict[float, int] = dict() # key:val = aggr_ipc_diff:grp_idx def __del__(self): logger = logging.getLogger(__name__) + print('SwapIsolator is closed...') def update_cont_group(self) -> None: """ @@ -51,13 +56,14 @@ def update_cont_group(self) -> None: group.update_aggr_ipc() aggr_ipc_diff = group.aggr_ipc all_ipc_diffs.append(aggr_ipc_diff) - self.ipc_diffs[aggr_ipc_diff] = grp_idx + self.aggr_ipc_diffs[aggr_ipc_diff] = grp_idx max_aggr_ipc_diff = max(all_ipc_diffs) min_aggr_ipc_diff = min(all_ipc_diffs) - swap_out_grp = self.ipc_diffs[max_aggr_ipc_diff] - swap_in_grp = self.ipc_diffs[min_aggr_ipc_diff] + # Lower ipc diff means lower performance relative to solo-run + swap_out_grp = self.aggr_ipc_diffs[min_aggr_ipc_diff] + swap_in_grp = self.aggr_ipc_diffs[max_aggr_ipc_diff] self._most_cont_group = swap_out_grp self._least_cont_group = swap_in_grp @@ -67,8 +73,9 @@ def choose_swap_candidates(self): swap_in_grp = self._least_cont_group # FIXME: This part depends on the swap policy (Which one is selected for swapping) - swap_out_wl = swap_out_grp.most_cont_workload - swap_in_wl = 
swap_in_grp.most_cont_workload # It selects the bg workload in swap_in group + # TODO: Need Tests for Swap Overhead + swap_out_wl = swap_out_grp.least_mem_bw_workload + swap_in_wl = swap_in_grp.least_mem_bw_workload # It selects the bg workload in swap_in group self._swap_candidates[SwapNextStep.OUT] = swap_out_wl self._swap_candidates[SwapNextStep.IN] = swap_in_wl @@ -76,6 +83,63 @@ def choose_swap_candidates(self): def first_decision(self): return - def enforce(self): - return + def swap_is_needed(self) -> bool: + #aggr_ipc_diff_list = list() + #for _, group in self._all_groups.items(): + # aggr_ipc_diff_list.append(group.aggr_ipc) + + #min_ipc_diff = min(aggr_ipc_diff_list) + #avg_min_ipc_diff = min_ipc_diff/2 + # FIXME: We used the average ipc diff value (We assume two workloads in a group at most) + avg_min_ipc_diff = self._most_cont_group.aggr_ipc/2 + + # TODO: Test the _IPC_DIFF_THRESHOLD + if avg_min_ipc_diff < self._IPC_DIFF_THRESHOLD: + return True + else: + return False + + def do_swap(self) -> None: + # Enable CPUSET memory migration + out_proc, in_proc = self.pre_swap_setup() + + out_cpuset = self._swap_candidates[SwapNextStep.OUT].cpuset + in_cpuset = self._swap_candidates[SwapNextStep.IN].cpuset + out_skt = self._swap_candidates[SwapNextStep.OUT].socket_id + in_skt = self._swap_candidates[SwapNextStep.OUT].socket_id + + # Suspend Procs and Enforce Swap Conf. 
+ os.kill(self._swap_candidates[SwapNextStep.OUT].pid, signal.SIGSTOP) + os.kill(self._swap_candidates[SwapNextStep.IN].pid, signal.SIGSTOP) + + out_proc.assign_cpus(set(in_cpuset)) + out_proc.assign_mems(set(out_skt)) + in_proc.assign_cpus(set(out_cpuset)) + in_proc.assign_mems(set(in_skt)) + + # Resume Procs + os.kill(self._swap_candidates[SwapNextStep.OUT].pid, signal.SIGCONT) + os.kill(self._swap_candidates[SwapNextStep.IN].pid, signal.SIGCONT) + + def pre_swap_setup(self) -> Tuple[Cgroup, Cgroup]: + swap_out_workload = self._swap_candidates[SwapNextStep.OUT] + swap_in_workload = self._swap_candidates[SwapNextStep.IN] + + swap_out_grp_name = f'{swap_out_workload.name}_{swap_out_workload.pid}' + swap_in_grp_name = f'{swap_in_workload.name}_{swap_in_workload.pid}' + + out_proc = Cgroup(swap_out_grp_name, 'cpuset,cpu') + in_proc = Cgroup(swap_in_grp_name, 'cpuset,cpu') + + out_proc.enable_memory_migrate() + in_proc.enable_memory_migrate() + + return out_proc, in_proc + + def try_swap(self) -> None: + self.update_cont_group() + self.choose_swap_candidates() + if self.swap_is_needed: + self.do_swap() + From dfc1af68f39655839a2c3f90831e61a4f83d2556 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 01:04:01 +0900 Subject: [PATCH 18/82] refactor swap_iso.py --- controller.py | 15 +--- isolating_controller/utils/cgroup/cpuset.py | 7 +- isolating_controller/workload.py | 4 + swap_iso.py | 85 +++++++++------------ 4 files changed, 46 insertions(+), 65 deletions(-) diff --git a/controller.py b/controller.py index ce9aa62..7fbb8cf 100755 --- a/controller.py +++ b/controller.py @@ -20,7 +20,7 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import DiffCPUPolicy, DiffPolicy, IsolationPolicy +from isolating_controller.isolation.policies import DiffCPUPolicy, IsolationPolicy from 
isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue @@ -140,13 +140,12 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._interval: float = 0.2 # Scheduling interval self._isolation_groups: Dict[IsolationPolicy, int] = dict() - self._all_groups: Dict[int, IsolationPolicy] = dict() - self._swapper: SwapIsolator = None + # Swapper init + self._swapper: SwapIsolator = SwapIsolator(self._isolation_groups) def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - # TODO: Swapper may come here self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): @@ -196,12 +195,6 @@ def _register_pending_workloads(self) -> None: self._isolation_groups[pending_group] = 0 pending_group.init_isolators() - # init self._all_groups if pending_group exist - if len(self._isolation_groups) > 0: - all_groups = list(self._isolation_groups.keys()) - for idx, group in enumerate(all_groups): - self._all_groups[idx] = group - def _remove_ended_groups(self) -> None: """ deletes the finished workloads(threads) from the dict. 
@@ -224,8 +217,6 @@ def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') - # Swapper init - self._swapper = SwapIsolator(self._all_groups) while True: self._remove_ended_groups() self._register_pending_workloads() diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index adf11f5..ee81cbd 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -2,7 +2,7 @@ import subprocess -from typing import Iterable, Set +from typing import Iterable from .base import BaseCgroup @@ -14,6 +14,9 @@ def assign_cpus(self, core_set: Iterable[int]) -> None: core_ids = ','.join(map(str, core_set)) subprocess.check_call(args=('cgset', '-r', f'cpuset.cpus={core_ids}', self._group_name)) - def assign_mems(self, socket_set: Set[int]) -> None: + def assign_mems(self, socket_set: Iterable[int]) -> None: mem_ids = ','.join(map(str, socket_set)) subprocess.check_call(args=('cgset', '-r', f'cpuset.mems={mem_ids}', self._group_name)) + + def set_memory_migrate(self, flag: bool) -> None: + subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate={int(flag)}', self._group_name)) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 23513b7..8a23099 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -74,6 +74,10 @@ def is_running(self) -> bool: def ipc_diff(self) -> float: return self._ipc_diff + @property + def group_name(self) -> str: + return f'{self.name}_{self.pid}' + def calc_metric_diff(self) -> MetricDiff: solorun_data = data_map[self.name] curr_metric: BasicMetric = self._metrics[0] diff --git a/swap_iso.py b/swap_iso.py index f22e249..3e0bbb0 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -1,15 +1,14 @@ # coding: UTF-8 +import logging import os import signal -import logging - from enum import IntEnum from typing import Dict, Optional, Tuple -from 
isolating_controller.workload import Workload from isolating_controller.isolation.policies.base_policy import IsolationPolicy -from isolating_controller.utils.cgroup import Cgroup +from isolating_controller.utils.cgroup import CpuSet +from isolating_controller.workload import Workload class SwapNextStep(IntEnum): @@ -22,12 +21,12 @@ class SwapIsolator: # "-0.5" means the IPCs of workloads in a group drop 50% compared to solo-run _IPC_DIFF_THRESHOLD = -0.5 - def __init__(self, isolation_groups: Dict[int, IsolationPolicy]) -> None: + def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: """ :param isolation_groups: Dict. Key is the index of group and Value is the group itself """ - self._all_groups: Dict[int, IsolationPolicy] = isolation_groups + self._all_groups: Dict[IsolationPolicy, int] = isolation_groups self._swap_candidates: Dict[SwapNextStep, Workload] = dict() self._most_cont_group: Optional[IsolationPolicy] = None @@ -36,11 +35,9 @@ def __init__(self, isolation_groups: Dict[int, IsolationPolicy]) -> None: self._most_cont_workload: Optional[Workload] = None self._least_cont_workload: Optional[Workload] = None - self.aggr_ipc_diffs: Dict[float, int] = dict() # key:val = aggr_ipc_diff:grp_idx - def __del__(self): logger = logging.getLogger(__name__) - print('SwapIsolator is closed...') + logger.info('SwapIsolator is closed...') def update_cont_group(self) -> None: """ @@ -49,21 +46,20 @@ def update_cont_group(self) -> None: Assumption : Swap Isolator swaps workloads between the most cont. group and the least cont. group """ - all_ipc_diffs = list() - # Update Aggr. 
IPC Diffs of All Groups - for grp_idx, group in self._all_groups.items(): - group.update_aggr_ipc() - aggr_ipc_diff = group.aggr_ipc - all_ipc_diffs.append(aggr_ipc_diff) - self.aggr_ipc_diffs[aggr_ipc_diff] = grp_idx + swap_in_grp: Optional[IsolationPolicy] = None + swap_out_grp: Optional[IsolationPolicy] = None - max_aggr_ipc_diff = max(all_ipc_diffs) - min_aggr_ipc_diff = min(all_ipc_diffs) + for group in self._all_groups.keys(): + if swap_in_grp is None: + swap_in_grp = group + if swap_out_grp is None: + swap_out_grp = group - # Lower ipc diff means lower performance relative to solo-run - swap_out_grp = self.aggr_ipc_diffs[min_aggr_ipc_diff] - swap_in_grp = self.aggr_ipc_diffs[max_aggr_ipc_diff] + # FIXME: replace to property + group.update_aggr_ipc() + swap_in_grp = max(swap_in_grp, group, key=lambda x: x.aggr_ipc) + swap_out_grp = min(swap_out_grp, group, key=lambda x: x.aggr_ipc) self._most_cont_group = swap_out_grp self._least_cont_group = swap_in_grp @@ -84,14 +80,8 @@ def first_decision(self): return def swap_is_needed(self) -> bool: - #aggr_ipc_diff_list = list() - #for _, group in self._all_groups.items(): - # aggr_ipc_diff_list.append(group.aggr_ipc) - - #min_ipc_diff = min(aggr_ipc_diff_list) - #avg_min_ipc_diff = min_ipc_diff/2 # FIXME: We used the average ipc diff value (We assume two workloads in a group at most) - avg_min_ipc_diff = self._most_cont_group.aggr_ipc/2 + avg_min_ipc_diff = self._most_cont_group.aggr_ipc / 2 # TODO: Test the _IPC_DIFF_THRESHOLD if avg_min_ipc_diff < self._IPC_DIFF_THRESHOLD: @@ -101,38 +91,33 @@ def swap_is_needed(self) -> bool: def do_swap(self) -> None: # Enable CPUSET memory migration - out_proc, in_proc = self.pre_swap_setup() + out_cgroup, in_cgroup = self.pre_swap_setup() - out_cpuset = self._swap_candidates[SwapNextStep.OUT].cpuset - in_cpuset = self._swap_candidates[SwapNextStep.IN].cpuset - out_skt = self._swap_candidates[SwapNextStep.OUT].socket_id - in_skt = self._swap_candidates[SwapNextStep.OUT].socket_id 
+ out_wl = self._swap_candidates[SwapNextStep.OUT] + in_wl = self._swap_candidates[SwapNextStep.IN] # Suspend Procs and Enforce Swap Conf. - os.kill(self._swap_candidates[SwapNextStep.OUT].pid, signal.SIGSTOP) - os.kill(self._swap_candidates[SwapNextStep.IN].pid, signal.SIGSTOP) + os.kill(out_wl.pid, signal.SIGSTOP) + os.kill(in_wl.pid, signal.SIGSTOP) - out_proc.assign_cpus(set(in_cpuset)) - out_proc.assign_mems(set(out_skt)) - in_proc.assign_cpus(set(out_cpuset)) - in_proc.assign_mems(set(in_skt)) + out_cgroup.assign_cpus(in_wl.cpuset) + out_cgroup.assign_mems((in_wl.cur_socket_id(),)) + in_cgroup.assign_cpus(out_wl.cpuset) + in_cgroup.assign_mems((out_wl.cur_socket_id(),)) # Resume Procs - os.kill(self._swap_candidates[SwapNextStep.OUT].pid, signal.SIGCONT) - os.kill(self._swap_candidates[SwapNextStep.IN].pid, signal.SIGCONT) + os.kill(out_wl.pid, signal.SIGCONT) + os.kill(in_wl.pid, signal.SIGCONT) - def pre_swap_setup(self) -> Tuple[Cgroup, Cgroup]: + def pre_swap_setup(self) -> Tuple[CpuSet, CpuSet]: swap_out_workload = self._swap_candidates[SwapNextStep.OUT] swap_in_workload = self._swap_candidates[SwapNextStep.IN] - swap_out_grp_name = f'{swap_out_workload.name}_{swap_out_workload.pid}' - swap_in_grp_name = f'{swap_in_workload.name}_{swap_in_workload.pid}' + out_proc = CpuSet(swap_out_workload.group_name) + in_proc = CpuSet(swap_in_workload.group_name) - out_proc = Cgroup(swap_out_grp_name, 'cpuset,cpu') - in_proc = Cgroup(swap_in_grp_name, 'cpuset,cpu') - - out_proc.enable_memory_migrate() - in_proc.enable_memory_migrate() + out_proc.set_memory_migrate(True) + in_proc.set_memory_migrate(True) return out_proc, in_proc @@ -141,5 +126,3 @@ def try_swap(self) -> None: self.choose_swap_candidates() if self.swap_is_needed: self.do_swap() - - From f1491db59b7af07d465176ab73d272ba2f8fe545 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 03:52:22 +0900 Subject: [PATCH 19/82] Only modified to make sched work (swapper, core needs debugging) --- 
controller.py | 9 +- .../isolation/isolators/__init__.py | 1 + .../isolation/isolators/base_isolator.py | 7 +- .../isolation/isolators/core.py | 29 +++-- .../isolation/isolators/memory.py | 8 +- .../isolation/isolators/schedule.py | 110 ++++++++++++++++++ .../isolation/isolators/swap.py | 100 ---------------- .../isolation/policies/base_policy.py | 33 ++---- .../isolation/policies/diff_policy.py | 8 +- .../policies/diff_with_violation_policy.py | 6 +- .../isolation/policies/greedy_diff_policy.py | 9 +- .../greedy_diff_with_violation_policy.py | 6 +- .../metric_container/basic_metric.py | 39 ++++--- isolating_controller/utils/numa_topology.py | 2 +- pending_queue.py | 52 ++++----- swap_iso.py | 6 +- 16 files changed, 206 insertions(+), 219 deletions(-) create mode 100644 isolating_controller/isolation/isolators/schedule.py delete mode 100644 isolating_controller/isolation/isolators/swap.py diff --git a/controller.py b/controller.py index 7fbb8cf..aa7f828 100755 --- a/controller.py +++ b/controller.py @@ -20,7 +20,7 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import DiffCPUPolicy, IsolationPolicy +from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, IsolationPolicy from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue @@ -45,8 +45,7 @@ def __init__(self, metric_buf_size: int) -> None: self._rmq_host = 'localhost' self._rmq_creation_queue = 'workload_creation' - # FIXME : Hard coded - PendingQueue can have four workloads at most (second argument) - self._pending_wl = PendingQueue(DiffCPUPolicy, 2) + self._pending_wl = PendingQueue(GreedyDiffWViolationPolicy) self._control_thread = ControlThread(self._pending_wl) def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: 
BasicProperties, body: bytes) -> None: @@ -78,8 +77,6 @@ def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicP self._pending_wl.add(workload) - logger.info(f'{workload} is created') - wl_queue_name = '{}({})'.format(wl_name, pid) ch.queue_declare(wl_queue_name) ch.basic_consume(functools.partial(self._cbk_wl_monitor, workload), wl_queue_name) @@ -146,7 +143,7 @@ def __init__(self, pending_queue: PendingQueue) -> None: def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - self._swapper.try_swap() + # self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): logger.info('') diff --git a/isolating_controller/isolation/isolators/__init__.py b/isolating_controller/isolation/isolators/__init__.py index 634a419..3eecaf2 100644 --- a/isolating_controller/isolation/isolators/__init__.py +++ b/isolating_controller/isolation/isolators/__init__.py @@ -6,3 +6,4 @@ from .idle import IdleIsolator from .memory import MemoryIsolator from .core import CoreIsolator +from .schedule import SchedIsolator diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index 5ff6842..ecf45d6 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -1,15 +1,14 @@ # coding: UTF-8 from abc import ABCMeta, abstractmethod -from typing import Optional -from .. import NextStep, ResourceType +from .. 
import NextStep from ...metric_container.basic_metric import MetricDiff from ...workload import Workload class Isolator(metaclass=ABCMeta): - def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._prev_metric_diff: MetricDiff = foreground_wl.calc_metric_diff() self._foreground_wl = foreground_wl @@ -19,8 +18,6 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resour self._bg_next_step = NextStep.IDLE self._is_first_decision: bool = True - # FIXME: is it necessary? - self._contentious_resource: Optional[ResourceType] = cont_resource def __del__(self): self.reset() diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index bdaea18..b2226ea 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging -from typing import Tuple +from typing import Optional, Tuple from .base_isolator import Isolator from .. import NextStep, ResourceType @@ -14,22 +14,19 @@ class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - super().__init__(foreground_wl, background_wl) + def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: + super().__init__(foreground_wl, background_wl, cont_resource) self._fg_cpuset: Tuple[int, ...] = foreground_wl.cpuset self._bg_cpuset: Tuple[int, ...] = background_wl.cpuset self._cur_bg_step: int = min(self._bg_cpuset) self._cur_fg_step: int = max(self._fg_cpuset) - fg_grp_name: str = f'{foreground_wl.name}_{foreground_wl.pid}' - bg_grp_name: str = f'{background_wl.name}_{background_wl.pid}' - self._prev_fg_affinity: Tuple[int, ...] 
= foreground_wl.cpuset self._prev_bg_affinity: Tuple[int, ...] = background_wl.cpuset - self._fg_cgroup = CpuSet(fg_grp_name) - self._bg_cgroup = CpuSet(bg_grp_name) + self._fg_cgroup = CpuSet(foreground_wl.group_name) + self._bg_cgroup = CpuSet(background_wl.group_name) def strengthen(self) -> 'CoreIsolator': """ @@ -131,14 +128,14 @@ def _first_decision(self) -> NextStep: if self._contentious_resource == ResourceType.MEMORY: curr_diff = metric_diff.local_mem_util_ps elif self._contentious_resource == ResourceType.CPU: - curr_diff = metric_diff.ipc + curr_diff = metric_diff.instruction_ps logger = logging.getLogger(__name__) logger.debug(f'current diff: {curr_diff:>7.4f}') # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) - fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.ipc) - fg_weaken_cond = self.fg_weaken_cond(metric_diff.ipc) + fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.instruction_ps) + fg_weaken_cond = self.fg_weaken_cond(metric_diff.instruction_ps) if curr_diff < 0: if self.is_max_level: self._bg_next_step = NextStep.STOP @@ -172,8 +169,8 @@ def _monitoring_result(self) -> NextStep: prev_diff = self._prev_metric_diff.local_mem_util_ps diff_of_diff = curr_diff - prev_diff elif self._contentious_resource == ResourceType.CPU: - curr_diff = metric_diff.ipc - prev_diff = self._prev_metric_diff.ipc + curr_diff = metric_diff.instruction_ps + prev_diff = self._prev_metric_diff.instruction_ps diff_of_diff = curr_diff - prev_diff logger = logging.getLogger(__name__) @@ -181,11 +178,11 @@ def _monitoring_result(self) -> NextStep: logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) - fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.ipc) - fg_weaken_cond = self.fg_weaken_cond(metric_diff.ipc) + fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.instruction_ps) + fg_weaken_cond = 
self.fg_weaken_cond(metric_diff.instruction_ps) logger = logging.getLogger(__name__) - logger.info(f'metric_diff.ipc: {metric_diff.ipc}') + logger.info(f'metric_diff.instruction_ps: {metric_diff.instruction_ps}') logger.info(f'self.fg_strengthen_cond: {fg_strengthen_cond}') logger.info(f'self.fg_weaken_cond: {fg_weaken_cond}') diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 1bdb6d3..d275b4d 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -3,10 +3,8 @@ import logging from itertools import chain -from typing import Optional - from .base_isolator import Isolator -from .. import NextStep, ResourceType +from .. import NextStep from ...utils import DVFS from ...workload import Workload @@ -15,8 +13,8 @@ class MemoryIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: - super().__init__(foreground_wl, background_wl, cont_resource) + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + super().__init__(foreground_wl, background_wl) self._bg_affinity = background_wl.cpuset diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py new file mode 100644 index 0000000..78e4d2a --- /dev/null +++ b/isolating_controller/isolation/isolators/schedule.py @@ -0,0 +1,110 @@ +# coding: UTF-8 + +import logging + +from .base_isolator import Isolator +from .. 
import NextStep +from ...utils.cgroup.cpuset import CpuSet +from ...workload import Workload + + +class SchedIsolator(Isolator): + _DOD_THRESHOLD = 0.005 + _FORCE_THRESHOLD = 0.1 + + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + super().__init__(foreground_wl, background_wl) + + # FIXME: hard coded + if background_wl.cur_socket_id() is 1: + self._cur_step = 24 + else: + self._cur_step = 8 + + # FIXME: hard coded + self._prev_bg_affinity = range(8, 16) if background_wl.cur_socket_id() is 0 else range(24, 32) + + self._bg_grp = CpuSet(background_wl.group_name) + + def strengthen(self) -> 'SchedIsolator': + self._cur_step += 1 + return self + + def weaken(self) -> 'SchedIsolator': + self._cur_step -= 1 + return self + + @property + def is_max_level(self) -> bool: + # FIXME: hard coded + if self._background_wl.cur_socket_id() is 1: + return self._cur_step == 31 + else: + return self._cur_step == 15 + + @property + def is_min_level(self) -> bool: + # FIXME: hard coded + if self._background_wl.cur_socket_id() is 1: + return self._cur_step == 24 + else: + return self._cur_step == 8 + + def _enforce(self) -> None: + logger = logging.getLogger(__name__) + # FIXME: hard coded + if self._background_wl.cur_socket_id() is 1: + logger.info(f'affinity of background is {self._cur_step}-31') + else: + logger.info(f'affinity of background is {self._cur_step}-15') + + # FIXME: hard coded + self._bg_grp.assign_cpus(range(self._cur_step, 32 if self._background_wl.cur_socket_id() is 1 else 16)) + + def _first_decision(self) -> NextStep: + metric_diff = self._foreground_wl.calc_metric_diff() + curr_diff = metric_diff.local_mem_util_ps + + logger = logging.getLogger(__name__) + logger.debug(f'current diff: {curr_diff:>7.4f}') + + if curr_diff < 0: + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN + elif curr_diff <= SchedIsolator._FORCE_THRESHOLD: + return NextStep.STOP + else: + if self.is_min_level: + return 
NextStep.STOP + else: + return NextStep.WEAKEN + + def _monitoring_result(self) -> NextStep: + metric_diff = self._foreground_wl.calc_metric_diff() + + curr_diff = metric_diff.local_mem_util_ps + prev_diff = self._prev_metric_diff.local_mem_util_ps + diff_of_diff = curr_diff - prev_diff + + logger = logging.getLogger(__name__) + logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') + logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') + + # FIXME: hard coded + if (self._background_wl.cur_socket_id() is 1 and not (24 < self._cur_step < 31) or + self._background_wl.cur_socket_id() is 0 and not (8 < self._cur_step < 15)) \ + or abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ + or abs(curr_diff) <= SchedIsolator._DOD_THRESHOLD: + return NextStep.STOP + + elif curr_diff > 0: + return NextStep.WEAKEN + + else: + return NextStep.STRENGTHEN + + def reset(self) -> None: + if self._background_wl.is_running: + self._bg_grp.assign_cpus(self._prev_bg_affinity) diff --git a/isolating_controller/isolation/isolators/swap.py b/isolating_controller/isolation/isolators/swap.py deleted file mode 100644 index b7fd82c..0000000 --- a/isolating_controller/isolation/isolators/swap.py +++ /dev/null @@ -1,100 +0,0 @@ -# coding: UTF-8 - -import logging -from typing import Dict, Set - -from .base_isolator import Isolator -from .. 
import NextStep -from ..policies import IsolationPolicy -from ...workload import Workload - - - -class SwapIsolator(Isolator): - _THRESHOLD = 0.005 - - def __init__(self, foreground_wl: Workload, background_wl: Workload, - isolation_groups: Dict[IsolationPolicy, int]) -> None: - super().__init__(foreground_wl, background_wl) - - self._all_groups = isolation_groups - self._swap_candidates: Set[Workload] = None - self._most_contentious_group = None - self._most_contentious_workload = None - - def __del__(self): - logger = logging.getLogger(__name__) - if self._foreground_wl.is_running: - logger.debug(f'reset swap configuration of {self._foreground_wl}') - - if self._background_wl.is_running: - logger.debug(f'reset swap configuration of {self._background_wl}') - - def strengthen(self) -> 'SwapIsolator': - """ - Choosing which contentious workloads to swap out to other socket - :return: - """ - # FIXME: hard coded (two sockets) - ## 1.Estimating and selecting the most contentious workloads from the socket of cur_group - ## 2. 
- - return self - - @property - def is_max_level(self) -> bool: - """ - Searching configuration space to the max level - e.g., There is no searchable candidate to strengthen the degree of isolation - :return: - """ - # FIXME: hard coded - return self._swap_candidates is None - - @property - def is_min_level(self) -> bool: - """ - Searching configuration space to the min level - e.g., There is no searchable candidate to weaken the degree of isolation - :return: - """ - # FIXME: hard coded - return self._swap_candidates is None - - def weaken(self) -> 'SwapIsolator': - """ - Choosing which contentious workloads to swap in from other socket - :return: - """ - # FIXME: hard coded (two sockets) - ## 1.Estimating and selecting the most contentious workloads from the socket of other_group - return self - - def _enforce(self) -> None: - """ - Enforcing the pre-configured swap isolation - :return: - """ - pass - - # def enforce(self) -> None: - # self._prev_metric_diff: MetricDiff = self._foreground_wl.calc_metric_diff() - # - # self._enforce() - - def _first_decision(self) -> NextStep: - """ - How to choose the first candidate? - :return: - """ - pass - - def _monitoring_result(self) -> NextStep: - """ - If the effect of swapping is getting worse, then rollback?? - :return: - """ - pass - - def reset(self) -> None: - pass diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 4359d0d..0e73650 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -3,10 +3,10 @@ from abc import ABCMeta, abstractmethod from typing import Mapping, Type -from isolating_controller.metric_container.basic_metric import MetricDiff, BasicMetric -from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, CoreIsolator -from ...workload import Workload from .. 
import ResourceType +from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator +from ...metric_container.basic_metric import BasicMetric, MetricDiff +from ...workload import Workload class IsolationPolicy(metaclass=ABCMeta): @@ -32,9 +32,9 @@ def __repr__(self) -> str: # FIXME: If you use policy without CPUIso., then changing ResourceType.Unknown to ResourceType.Memory def init_isolators(self) -> None: self._isolator_map = dict(( - (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl, ResourceType.CACHE)), - (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl, ResourceType.MEMORY)), - (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl, ResourceType.Unknown)) + (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), + (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), + (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)) )) @property @@ -55,7 +55,7 @@ def contentious_resource(self) -> ResourceType: logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}') if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: - return ResourceType.CPU + return ResourceType.CPU if metric_diff.local_mem_util_ps > 0 and metric_diff.l3_hit_ratio > 0: if metric_diff.l3_hit_ratio > metric_diff.local_mem_util_ps: @@ -140,22 +140,13 @@ def least_mem_bw_workload(self) -> Workload: else: return fg_wl - def update_aggr_ipc(self) -> None: - fg_diff = self._fg_wl.calc_metric_diff() - bg_diff = self._bg_wl.calc_metric_diff() - self._fg_wl._ipc_diff = fg_diff.ipc - self._bg_wl._ipc_diff = bg_diff.ipc - self._aggr_ipc_diff = fg_diff.ipc + bg_diff.ipc - - def contention_diff(self, rtype: ResourceType) -> float: + # FIXME: replace to property + def update_aggr_instr(self) -> None: fg_diff = self._fg_wl.calc_metric_diff() bg_diff = self._bg_wl.calc_metric_diff() - if rtype is ResourceType.CPU: - return fg_diff.ipc + bg_diff.ipc - elif 
rtype is ResourceType.CACHE: - return fg_diff.l3_hit_ratio + bg_diff.l3_hit_ratio - elif rtype is ResourceType.MEMORY: - return fg_diff.local_mem_util_ps + bg_diff.local_mem_util_ps + self._fg_wl._ipc_diff = fg_diff.instruction_ps + self._bg_wl._ipc_diff = bg_diff.instruction_ps + self._aggr_ipc_diff = fg_diff.instruction_ps + bg_diff.instruction_ps def set_idle_isolator(self) -> None: self._cur_isolator.yield_isolation() diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/diff_policy.py index 3a615b1..f9285b3 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/diff_policy.py @@ -2,9 +2,9 @@ import logging -from .. import ResourceType from .base_policy import IsolationPolicy -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from .. import ResourceType +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -31,7 +31,7 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() - if self._is_sched_isolated and self._is_mem_isolated and self._is_llc_isolated: + if self._is_core_isolated and self._is_mem_isolated and self._is_llc_isolated: self._clear_flags() logger.debug('****All isolators are applicable for now!****') @@ -48,7 +48,7 @@ def choose_next_isolator(self) -> bool: return True elif not self._is_core_isolated and resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator = self._isolator_map[SchedIsolator] self._is_core_isolated = True logger.info(f'Core Isolation for {self._fg_wl} is started') return True diff --git a/isolating_controller/isolation/policies/diff_with_violation_policy.py b/isolating_controller/isolation/policies/diff_with_violation_policy.py index 5649c4c..4b264c7 100644 --- 
a/isolating_controller/isolation/policies/diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/diff_with_violation_policy.py @@ -2,9 +2,9 @@ import logging -from .. import ResourceType from .diff_policy import DiffPolicy -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from .. import ResourceType +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -22,7 +22,7 @@ def _check_violation(self) -> bool: return \ resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, CoreIsolator)) + and not isinstance(self._cur_isolator, SchedIsolator)) @property def new_isolator_needed(self) -> bool: diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 7f557d2..b49ad49 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -2,9 +2,9 @@ import logging -from .. import ResourceType from .base_policy import IsolationPolicy -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from .. 
import ResourceType +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -25,8 +25,7 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() if resource is ResourceType.CPU: - self._cur_isolator = self._isolator_map[CoreIsolator] - self._cur_isolator._contentious_resource = ResourceType.CPU + self._cur_isolator = self._isolator_map[SchedIsolator] logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') return True @@ -43,7 +42,7 @@ def choose_next_isolator(self) -> bool: return True elif resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator = self._isolator_map[SchedIsolator] self._is_mem_isolated = False logger.info(f'Cpuset Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') return True diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 8e438f0..057dcfd 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -2,9 +2,9 @@ import logging -from .. import ResourceType from .greedy_diff_policy import GreedyDiffPolicy -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from .. 
import ResourceType +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -22,7 +22,7 @@ def _check_violation(self) -> bool: return \ resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, CoreIsolator)) + and not isinstance(self._cur_isolator, SchedIsolator)) @property def new_isolator_needed(self) -> bool: diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 363c680..7528dae 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -34,6 +34,10 @@ def l3miss(self): def instruction(self): return self._instructions + @property + def instruction_ps(self): + return self._instructions * (1000 / self._interval) + @property def cycles(self): return self._cycles @@ -73,23 +77,23 @@ def req_date(self): return self._req_date @property - def ipc(self): + def ipc(self) -> float: return self._instructions / self._cycles @property - def intra_coh_ratio(self): + def intra_coh_ratio(self) -> float: return self._intra_coh / self._l2miss @property - def inter_coh_ratio(self): + def inter_coh_ratio(self) -> float: return self._inter_coh / self._l2miss @property - def coh_ratio(self): + def coh_ratio(self) -> float: return (self._inter_coh + self._intra_coh) / self._l2miss @property - def l3miss_ratio(self): + def l3miss_ratio(self) -> float: return self._l3miss / self._l2miss @property @@ -98,50 +102,47 @@ def l3hit_ratio(self) -> float: @property def llc_util(self) -> float: - return self._llc_size/LLC_SIZE + return self._llc_size / LLC_SIZE @property - def l3_intensity(self): + def l3_intensity(self) -> float: l3_hit_ratio = 1 - self.l3miss_ratio return self.llc_util * l3_hit_ratio @property - def 
mem_intensity(self): + def mem_intensity(self) -> float: return self.llc_util * self.l3miss_ratio - def __str__(self): + def __repr__(self) -> str: return ', '.join(map(str, ( self._l2miss, self._l3miss, self._instructions, self._cycles, self._stall_cycles, self._intra_coh, self._inter_coh, self._llc_size, self._req_date))) - def __repr__(self): - return self.__str__() - class MetricDiff: def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio self._local_mem_ps = curr.local_mem_ps() / prev.local_mem_ps() - 1 self._remote_mem_ps = curr.remote_mem_ps() / prev.remote_mem_ps() - 1 - self._ipc = curr.ipc() / prev.ipc() - 1 + self._instruction_ps = curr.instruction_ps / prev.instruction_ps - 1 @property - def l3_hit_ratio(self): + def l3_hit_ratio(self) -> float: return self._l3_hit_ratio @property - def local_mem_util_ps(self): + def local_mem_util_ps(self) -> float: return self._local_mem_ps @property - def remote_mem_ps(self): + def remote_mem_ps(self) -> float: return self._remote_mem_ps @property - def ipc(self): - return self._ipc + def instruction_ps(self) -> float: + return self._instruction_ps def __repr__(self) -> str: return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, ' \ f'Local Memory access diff: {self._local_mem_ps:>6.03f}, ' \ - f'IPC diff: {self.ipc:>6.03f}' + f'instruction diff: {self._instruction_ps:>6.03f}' diff --git a/isolating_controller/utils/numa_topology.py b/isolating_controller/utils/numa_topology.py index a3e3f2f..d2831ca 100644 --- a/isolating_controller/utils/numa_topology.py +++ b/isolating_controller/utils/numa_topology.py @@ -40,7 +40,7 @@ def core_belongs_to(socket_id: int) -> Set[int]: def _node_to_core() -> Dict[int, Set[int]]: node_list = cur_online_nodes() - return dict((socket_id, core_belongs_to(socket_id) for socket_id in node_list)) + return dict((socket_id, core_belongs_to(socket_id)) for socket_id in node_list) def _core_to_node() -> Dict[int, int]: diff 
--git a/pending_queue.py b/pending_queue.py index d8338a5..52d83b2 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -1,19 +1,20 @@ # coding: UTF-8 import logging -from typing import Dict, List, Sized, Type +from collections import defaultdict +from typing import DefaultDict, Dict, List, Sized, Tuple, Type from isolating_controller.isolation.policies import IsolationPolicy from isolating_controller.workload import Workload class PendingQueue(Sized): - def __init__(self, policy_type: Type[IsolationPolicy], max_pending: int) -> None: + def __init__(self, policy_type: Type[IsolationPolicy]) -> None: self._policy_type: Type[IsolationPolicy] = policy_type - self._max_pending: int = max_pending - self._cur_ready: int = 0 - self._ready_q: Dict[int, List[Workload]] = dict() # key: socket id, value: workloads + self._bg_q: Dict[Tuple[int, ...], Workload] = dict() + self._fg_q: Dict[Tuple[int, ...], Workload] = dict() + self._ready_queue: DefaultDict[int, List[Workload]] = defaultdict(list) self._pending_list: List[IsolationPolicy] = list() def __len__(self) -> int: @@ -23,33 +24,26 @@ def __len__(self) -> int: def add(self, workload: Workload) -> None: logger = logging.getLogger(__name__) - logger.debug(f'self._cur_ready: {self._cur_ready}') + logger.info(f'{workload} is ready for active') - self._ready_q[workload.cur_socket_id()].append(workload) - self._cur_ready += 1 - if self._cur_ready == self._max_pending: - self._dump_to_pending_list() + ready_queue = self._ready_queue[workload.cur_socket_id()] + ready_queue.append(workload) + + # FIXME: hard coded + if len(ready_queue) is 2 and ready_queue[0].wl_type != ready_queue[1].wl_type: + if ready_queue[0].wl_type == 'fg': + fg = ready_queue[0] + bg = ready_queue[1] + else: + fg = ready_queue[1] + bg = ready_queue[0] + + new_group = self._policy_type(fg, bg) + self._pending_list.append(new_group) + + self._ready_queue[workload.cur_socket_id()] = list() def pop(self) -> IsolationPolicy: if len(self) is 0: raise 
IndexError(f'{self} is empty') return self._pending_list.pop() - - def _dump_to_pending_list(self) -> None: - logger = logging.getLogger(__name__) - logger.debug('Dumping workloads to pending list!') - - for socket_id, workloads in self._ready_q.items(): - # FIXME: hard coded - if len(workloads) is 2 and workloads[0].wl_type != workloads[1].wl_type: - if workloads[0].wl_type == 'fg': - fg = workloads[0] - bg = workloads[1] - else: - fg = workloads[1] - bg = workloads[0] - new_group = self._policy_type(fg, bg) - self._pending_list.append(new_group) - - def update_max_pending(self, new_max_pending: int): - self._max_pending = new_max_pending diff --git a/swap_iso.py b/swap_iso.py index 3e0bbb0..095f52a 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -56,8 +56,7 @@ def update_cont_group(self) -> None: if swap_out_grp is None: swap_out_grp = group - # FIXME: replace to property - group.update_aggr_ipc() + group.update_aggr_instr() swap_in_grp = max(swap_in_grp, group, key=lambda x: x.aggr_ipc) swap_out_grp = min(swap_out_grp, group, key=lambda x: x.aggr_ipc) @@ -122,6 +121,9 @@ def pre_swap_setup(self) -> Tuple[CpuSet, CpuSet]: return out_proc, in_proc def try_swap(self) -> None: + if len(self._all_groups) < 2: + return + self.update_cont_group() self.choose_swap_candidates() if self.swap_is_needed: From c81a05549d458a27ed268e605e14a6dc7abc55e7 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 16:53:21 +0900 Subject: [PATCH 20/82] move groups (cgroup, resctrl) into Workload class --- .../isolation/isolators/cache.py | 11 ++--- .../isolation/isolators/memory.py | 6 +-- .../isolation/isolators/schedule.py | 7 +--- .../isolation/policies/base_policy.py | 1 - .../metric_container/basic_metric.py | 7 ++-- isolating_controller/utils/cgroup/cpuset.py | 11 ++++- isolating_controller/utils/resctrl.py | 21 +++++++++- isolating_controller/workload.py | 42 +++++++++++++++---- swap_iso.py | 29 ++++++------- 9 files changed, 89 insertions(+), 46 deletions(-) diff 
--git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 1c63694..86923b3 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -19,9 +19,6 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._prev_step: Optional[int] = None self._cur_step: Optional[int] = None - self._fg_resctrl = ResCtrl(f'{foreground_wl.name}_{foreground_wl.pid}') - self._bg_resctrl = ResCtrl(f'{background_wl.name}_{background_wl.pid}') - def strengthen(self) -> 'CacheIsolator': self._prev_step = self._cur_step @@ -66,12 +63,12 @@ def _enforce(self) -> None: # FIXME: hard coded -> The number of socket is two at most masks = [ResCtrl.MIN_MASK, ResCtrl.MIN_MASK] masks[self._foreground_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) - self._fg_resctrl.assign_llc(*masks) + self._foreground_wl.resctrl.assign_llc(*masks) # FIXME: hard coded -> The number of socket is two at most masks = [ResCtrl.MIN_MASK, ResCtrl.MIN_MASK] masks[self._background_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) - self._bg_resctrl.assign_llc(*masks) + self._background_wl.resctrl.assign_llc(*masks) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() @@ -129,8 +126,8 @@ def reset(self) -> None: if self._background_wl.is_running: bg_masks = masks.copy() bg_masks[self._background_wl.cur_socket_id()] = ResCtrl.MAX_MASK - ResCtrl.assign_llc(self._bg_resctrl, *bg_masks) + self._background_wl.resctrl.assign_llc(*bg_masks) if self._foreground_wl.is_running: masks[self._foreground_wl.cur_socket_id()] = ResCtrl.MAX_MASK - ResCtrl.assign_llc(self._fg_resctrl, *masks) + self._foreground_wl.resctrl.assign_llc(*masks) diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index d275b4d..2319baa 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ 
b/isolating_controller/isolation/isolators/memory.py @@ -16,7 +16,7 @@ class MemoryIsolator(Isolator): def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) - self._bg_affinity = background_wl.cpuset + self._bg_affinity = background_wl.bound_cores # FIXME: hard coded self._cur_step = DVFS.MAX @@ -41,9 +41,9 @@ def is_min_level(self) -> bool: def _enforce(self) -> None: logger = logging.getLogger(__name__) - logger.info(f'frequency of cpuset {self._background_wl.cpuset} is {self._cur_step / 1_000_000}GHz') + logger.info(f'frequency of bound_cores {self._background_wl.bound_cores} is {self._cur_step / 1_000_000}GHz') - DVFS.set_freq(self._cur_step, self._background_wl.cpuset) + DVFS.set_freq(self._cur_step, self._background_wl.bound_cores) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 78e4d2a..82840c3 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -4,7 +4,6 @@ from .base_isolator import Isolator from .. 
import NextStep -from ...utils.cgroup.cpuset import CpuSet from ...workload import Workload @@ -24,8 +23,6 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: # FIXME: hard coded self._prev_bg_affinity = range(8, 16) if background_wl.cur_socket_id() is 0 else range(24, 32) - self._bg_grp = CpuSet(background_wl.group_name) - def strengthen(self) -> 'SchedIsolator': self._cur_step += 1 return self @@ -59,7 +56,7 @@ def _enforce(self) -> None: logger.info(f'affinity of background is {self._cur_step}-15') # FIXME: hard coded - self._bg_grp.assign_cpus(range(self._cur_step, 32 if self._background_wl.cur_socket_id() is 1 else 16)) + self._background_wl.bound_cores = range(self._cur_step, 32 if self._background_wl.cur_socket_id() is 1 else 16) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() @@ -107,4 +104,4 @@ def _monitoring_result(self) -> NextStep: def reset(self) -> None: if self._background_wl.is_running: - self._bg_grp.assign_cpus(self._prev_bg_affinity) + self._background_wl.bound_cores = self._prev_bg_affinity diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 0e73650..4568af3 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -29,7 +29,6 @@ def __hash__(self) -> int: def __repr__(self) -> str: return f'{self.__class__.__name__} ' - # FIXME: If you use policy without CPUIso., then changing ResourceType.Unknown to ResourceType.Memory def init_isolators(self) -> None: self._isolator_map = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 7528dae..bcedcb9 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -2,7 +2,9 @@ 
from time import localtime, strftime -LLC_SIZE: float = 41943040 +from cpuinfo import cpuinfo + +LLC_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 class BasicMetric: @@ -106,8 +108,7 @@ def llc_util(self) -> float: @property def l3_intensity(self) -> float: - l3_hit_ratio = 1 - self.l3miss_ratio - return self.llc_util * l3_hit_ratio + return self.llc_util * self.l3hit_ratio @property def mem_intensity(self) -> float: diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index ee81cbd..231b9bc 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -2,9 +2,10 @@ import subprocess -from typing import Iterable +from typing import Iterable, Set from .base import BaseCgroup +from ..hyphen import convert_to_set class CpuSet(BaseCgroup): @@ -20,3 +21,11 @@ def assign_mems(self, socket_set: Iterable[int]) -> None: def set_memory_migrate(self, flag: bool) -> None: subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate={int(flag)}', self._group_name)) + + def read_cpus(self) -> Set[int]: + cpus = subprocess.check_output(args=('cgget', '-nvr', 'bound_cores.cpus', self._group_name), encoding='ASCII') + return convert_to_set(cpus) + + def read_mems(self) -> Set[int]: + cpus = subprocess.check_output(args=('cgget', '-nvr', 'bound_cores.mems', self._group_name), encoding='ASCII') + return convert_to_set(cpus) diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index 6ef1a26..a6817de 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -1,7 +1,9 @@ # coding: UTF-8 +import re import subprocess from pathlib import Path +from typing import List, Tuple def len_of_mask(mask: str) -> int: @@ -24,6 +26,7 @@ class ResCtrl: MIN_BITS: int = int((MOUNT_POINT / 'info' / 'L3' / 'min_cbm_bits').read_text()) MIN_MASK: str = bits_to_mask(MIN_BITS) STEP = 1 + _read_regex: re = 
re.compile(r'L3:((\d+=[0-9a-fA-F]+;?)*)', re.MULTILINE) def __init__(self, group_name: str) -> None: self._group_name: str = group_name @@ -45,9 +48,23 @@ def add_task(self, pid: int) -> None: def assign_llc(self, *masks: str) -> None: masks = (f'{i}={mask}' for i, mask in enumerate(masks)) mask = ';'.join(masks) - subprocess.run(args=('sudo', 'tee', str(ResCtrl.MOUNT_POINT / self._group_name / 'schemata')), + subprocess.run(args=('sudo', 'tee', str(self._group_path / 'schemata')), input=f'L3:{mask}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) + def read_assigned_llc(self) -> Tuple[int, ...]: + schemata = self._group_path / 'schemata' + if not schemata.is_file(): + raise ProcessLookupError() + + with schemata.open() as fp: + content: str = fp.read().strip() + + l3_schemata = ResCtrl._read_regex.search(content).group(1) + + # example: [('0', '00fff'), ('1', 'fff00')] + pairs: List[Tuple[str, str]] = sorted(tuple(pair.split('=')) for pair in l3_schemata.split(';')) + return tuple(len_of_mask(mask) for socket, mask in pairs) + @staticmethod def gen_mask(start: int, end: int = None) -> str: if end is None or end > ResCtrl.MAX_BITS: @@ -59,4 +76,4 @@ def gen_mask(start: int, end: int = None) -> str: return format(((1 << (end - start)) - 1) << (ResCtrl.MAX_BITS - end), 'x') def remove_group(self) -> None: - subprocess.check_call(args=('sudo', 'rmdir', str(ResCtrl.MOUNT_POINT / self._group_name))) + subprocess.check_call(args=('sudo', 'rmdir', str(self._group_path))) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 8a23099..3101afa 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -2,16 +2,14 @@ from collections import deque from itertools import chain -from typing import Deque, Tuple +from typing import Deque, Iterable, Tuple -import cpuinfo import psutil from .metric_container.basic_metric import BasicMetric, MetricDiff from .solorun_data.datas import data_map -from .utils import 
numa_topology - -L3_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 +from .utils import ResCtrl, numa_topology +from .utils.cgroup import Cpu, CpuSet class Workload: @@ -32,12 +30,28 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._proc_info = psutil.Process(pid) self._ipc_diff: float = None + self._cgroup_cpuset = CpuSet(self.group_name) + self._cgroup_cpu = Cpu(self.group_name) + self._resctrl = ResCtrl(self.group_name) + def __repr__(self) -> str: return f'{self._name} (pid: {self._pid})' def __hash__(self) -> int: return self._pid + @property + def cgroup_cpuset(self) -> CpuSet: + return self._cgroup_cpuset + + @property + def cgroup_cpu(self) -> Cpu: + return self._cgroup_cpu + + @property + def resctrl(self) -> ResCtrl: + return self._resctrl + @property def name(self) -> str: return self._name @@ -55,8 +69,20 @@ def metrics(self) -> Deque[BasicMetric]: return self._metrics @property - def cpuset(self) -> Tuple[int, ...]: - return tuple(self._proc_info.cpu_affinity()) + def bound_cores(self) -> Tuple[int, ...]: + return tuple(self._cgroup_cpuset.read_cpus()) + + @bound_cores.setter + def bound_cores(self, core_ids: Iterable[int]): + self._cgroup_cpuset.assign_cpus(core_ids) + + @property + def bound_mems(self) -> Tuple[int, ...]: + return tuple(self._cgroup_cpuset.read_mems()) + + @bound_mems.setter + def bound_mems(self, affinity: Iterable[int]): + self._cgroup_cpuset.assign_mems(affinity) @property def perf_pid(self) -> int: @@ -94,7 +120,7 @@ def all_child_tid(self) -> Tuple[int, ...]: return tuple() def cur_socket_id(self) -> int: - sockets = frozenset(numa_topology.core_to_node[core_id] for core_id in self.cpuset) + sockets = frozenset(numa_topology.core_to_node[core_id] for core_id in self.bound_cores) # FIXME: hard coded if len(sockets) is not 1: diff --git a/swap_iso.py b/swap_iso.py index 095f52a..7634712 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -4,10 +4,9 @@ import os import signal 
from enum import IntEnum -from typing import Dict, Optional, Tuple +from typing import Dict, Optional from isolating_controller.isolation.policies.base_policy import IsolationPolicy -from isolating_controller.utils.cgroup import CpuSet from isolating_controller.workload import Workload @@ -90,8 +89,6 @@ def swap_is_needed(self) -> bool: def do_swap(self) -> None: # Enable CPUSET memory migration - out_cgroup, in_cgroup = self.pre_swap_setup() - out_wl = self._swap_candidates[SwapNextStep.OUT] in_wl = self._swap_candidates[SwapNextStep.IN] @@ -99,26 +96,26 @@ def do_swap(self) -> None: os.kill(out_wl.pid, signal.SIGSTOP) os.kill(in_wl.pid, signal.SIGSTOP) - out_cgroup.assign_cpus(in_wl.cpuset) - out_cgroup.assign_mems((in_wl.cur_socket_id(),)) - in_cgroup.assign_cpus(out_wl.cpuset) - in_cgroup.assign_mems((out_wl.cur_socket_id(),)) + out_cpus = out_wl.bound_cores + out_mems = out_wl.mems + in_cpus = in_wl.bound_cores + in_mems = in_wl.mems + + out_wl.bound_cores = in_cpus + out_wl.mems = in_mems + in_wl.bound_cores = out_cpus + in_wl.mems = out_mems # Resume Procs os.kill(out_wl.pid, signal.SIGCONT) os.kill(in_wl.pid, signal.SIGCONT) - def pre_swap_setup(self) -> Tuple[CpuSet, CpuSet]: + def pre_swap_setup(self) -> None: swap_out_workload = self._swap_candidates[SwapNextStep.OUT] swap_in_workload = self._swap_candidates[SwapNextStep.IN] - out_proc = CpuSet(swap_out_workload.group_name) - in_proc = CpuSet(swap_in_workload.group_name) - - out_proc.set_memory_migrate(True) - in_proc.set_memory_migrate(True) - - return out_proc, in_proc + swap_out_workload.cgroup_cpuset.set_memory_migrate(True) + swap_in_workload.cgroup_cpuset.set_memory_migrate(True) def try_swap(self) -> None: if len(self._all_groups) < 2: From e8cd6e56591d86f3562d7aeb480fd75abeb7e167 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 16:53:54 +0900 Subject: [PATCH 21/82] remove utils/cgroup.py --- isolating_controller/utils/cgroup.py | 68 ---------------------------- 1 file changed, 
68 deletions(-) delete mode 100644 isolating_controller/utils/cgroup.py diff --git a/isolating_controller/utils/cgroup.py b/isolating_controller/utils/cgroup.py deleted file mode 100644 index 1d933fa..0000000 --- a/isolating_controller/utils/cgroup.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: UTF-8 - - -import getpass -import grp -import os -import subprocess -from typing import Iterable, Optional, Set - -from .hyphen import convert_to_set - - -# TODO: delete -class Cgroup: - CPUSET_MOUNT_POINT = '/sys/fs/cgroup/cpuset' - CPU_MOUNT_POINT = '/sys/fs/cgroup/cpu' - - def __init__(self, group_name: str, controllers: str) -> None: - self._group_name: str = group_name - self._controllers: str = controllers - self._group_path: str = f'{controllers}:{group_name}' - - def create_group(self) -> None: - uname: str = getpass.getuser() - gid: int = os.getegid() - gname: str = grp.getgrgid(gid).gr_name - - subprocess.check_call(args=( - 'sudo', 'cgcreate', '-a', f'{uname}:{gname}', '-d', '700', '-f', - '600', '-t', f'{uname}:{gname}', '-s', '600', '-g', self._group_path)) - - def assign_cpus(self, core_set: Iterable[int]) -> None: - core_ids = ','.join(map(str, core_set)) - subprocess.check_call(args=('cgset', '-r', f'cpuset.cpus={core_ids}', self._group_name)) - - def assign_mems(self, socket_set: Set[int]) -> None: - mem_ids = ','.join(map(str, socket_set)) - subprocess.check_call(args=('cgset', '-r', f'cpuset.mems={mem_ids}', self._group_name)) - - def _get_cpu_affinity_from_group(self) -> Set[int]: - with open(f'{Cgroup.CPUSET_MOUNT_POINT}/{self._group_name}/cpuset.cpus', "r") as fp: - line: str = fp.readline() - core_set: Set[int] = convert_to_set(line) - return core_set - - def limit_cpu_quota(self, limit_percentage: float, period: Optional[int] = None) -> None: - if period is None: - with open(f'{Cgroup.CPU_MOUNT_POINT}/cpu.cfs_period_us', "r") as fp: - line: str = fp.readline() - period = int(line) - - cpu_cores = self._get_cpu_affinity_from_group() - quota = int(period 
* limit_percentage / 100 * len(cpu_cores)) - subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_quota_us={quota}', self._group_name)) - - subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_period_us={period}', self._group_name)) - - def add_tasks(self, pids: Iterable[int]) -> None: - subprocess.check_call(args=('cgclassify', '-g', self._group_path, '--sticky', *map(str, pids))) - - def delete(self) -> None: - subprocess.check_call(args=('sudo', 'cgdelete', '-r', '-g', self._group_path)) - - def enable_memory_migrate(self) -> None: - subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate=1', self._group_name)) - - def disable_memory_migrate(self) -> None: - subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate=0', self._group_name)) \ No newline at end of file From 7ac4e14876daa365819f620307a9684264274070 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 20:45:59 +0900 Subject: [PATCH 22/82] fixes typos and rename variables --- isolating_controller/isolation/isolators/memory.py | 5 ++--- isolating_controller/utils/cgroup/cpuset.py | 4 ++-- isolating_controller/utils/numa_topology.py | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 2319baa..41b4d96 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -1,7 +1,6 @@ # coding: UTF-8 import logging -from itertools import chain from .base_isolator import Isolator from .. 
import NextStep @@ -16,7 +15,7 @@ class MemoryIsolator(Isolator): def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) - self._bg_affinity = background_wl.bound_cores + self._orig_bg_affinity = background_wl.bound_cores # FIXME: hard coded self._cur_step = DVFS.MAX @@ -94,4 +93,4 @@ def _monitoring_result(self) -> NextStep: return NextStep.STRENGTHEN def reset(self) -> None: - DVFS.set_freq(DVFS.MAX, chain(self._bg_affinity)) + DVFS.set_freq(DVFS.MAX, self._orig_bg_affinity) diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index 231b9bc..22ba9b0 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -23,9 +23,9 @@ def set_memory_migrate(self, flag: bool) -> None: subprocess.check_call(args=('cgset', '-r', f'cpuset.memory_migrate={int(flag)}', self._group_name)) def read_cpus(self) -> Set[int]: - cpus = subprocess.check_output(args=('cgget', '-nvr', 'bound_cores.cpus', self._group_name), encoding='ASCII') + cpus = subprocess.check_output(args=('cgget', '-nvr', 'cpuset.cpus', self._group_name), encoding='ASCII') return convert_to_set(cpus) def read_mems(self) -> Set[int]: - cpus = subprocess.check_output(args=('cgget', '-nvr', 'bound_cores.mems', self._group_name), encoding='ASCII') + cpus = subprocess.check_output(args=('cgget', '-nvr', 'cpuset.mems', self._group_name), encoding='ASCII') return convert_to_set(cpus) diff --git a/isolating_controller/utils/numa_topology.py b/isolating_controller/utils/numa_topology.py index d2831ca..c3868c0 100644 --- a/isolating_controller/utils/numa_topology.py +++ b/isolating_controller/utils/numa_topology.py @@ -1,7 +1,7 @@ # coding: UTF-8 from pathlib import Path -from typing import Dict, Set +from typing import Dict, Mapping, Set from .hyphen import convert_to_set @@ -54,5 +54,5 @@ def _core_to_node() -> Dict[int, int]: return ret_dict -node_to_core: 
Dict[int, Set[int]] = _node_to_core() # key: socket id, value: corresponding core ids -core_to_node: Dict[int, int] = _core_to_node() # key: core id, value: corresponding socket id +node_to_core: Mapping[int, Set[int]] = _node_to_core() # key: socket id, value: corresponding core ids +core_to_node: Mapping[int, int] = _core_to_node() # key: core id, value: corresponding socket id From 653839099400a65198204225564cb4d9337fa72b Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 22:31:08 +0900 Subject: [PATCH 23/82] finish CoreIsolator --- .../isolation/isolators/core.py | 200 ++++++------------ .../isolation/policies/base_policy.py | 4 +- .../isolation/policies/greedy_diff_policy.py | 8 +- isolating_controller/workload.py | 21 +- 4 files changed, 88 insertions(+), 145 deletions(-) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index b2226ea..a7383ab 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,12 +1,9 @@ # coding: UTF-8 import logging -from typing import Optional, Tuple from .base_isolator import Isolator from .. import NextStep, ResourceType -from ...utils import hyphen, numa_topology -from ...utils.cgroup import CpuSet from ...workload import Workload @@ -14,19 +11,17 @@ class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload, cont_resource: Optional[ResourceType]) -> None: - super().__init__(foreground_wl, background_wl, cont_resource) + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + super().__init__(foreground_wl, background_wl) - self._fg_cpuset: Tuple[int, ...] = foreground_wl.cpuset - self._bg_cpuset: Tuple[int, ...] 
= background_wl.cpuset - self._cur_bg_step: int = min(self._bg_cpuset) - self._cur_fg_step: int = max(self._fg_cpuset) + # FIXME: hard coded (contiguous allocation) + self._cur_fg_step: int = foreground_wl.orig_bound_cores[-1] + self._cur_bg_step: int = background_wl.orig_bound_cores[0] - self._prev_fg_affinity: Tuple[int, ...] = foreground_wl.cpuset - self._prev_bg_affinity: Tuple[int, ...] = background_wl.cpuset + self._bg_next_step: NextStep = NextStep.IDLE + self._fg_next_step: NextStep = NextStep.IDLE - self._fg_cgroup = CpuSet(foreground_wl.group_name) - self._bg_cgroup = CpuSet(background_wl.group_name) + self._contentious_resource: ResourceType = ResourceType.MEMORY def strengthen(self) -> 'CoreIsolator': """ @@ -34,22 +29,13 @@ def strengthen(self) -> 'CoreIsolator': TODO: Changing step size, if needed """ # NOTE: Caller is assumed that BG workload - logger = logging.getLogger(__name__) - logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') - logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') - logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') - logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') if self._bg_next_step == NextStep.STRENGTHEN: - bg_cpuset = set(self._bg_cpuset) - bg_cpuset.remove(self._cur_bg_step) - self._bg_cpuset = tuple(bg_cpuset) self._cur_bg_step += 1 + if self._fg_next_step == NextStep.WEAKEN: - fg_cpuset = set(self._fg_cpuset) self._cur_fg_step += 1 - fg_cpuset.add(self._cur_fg_step) - self._fg_cpuset = tuple(fg_cpuset) + return self def weaken(self) -> 'CoreIsolator': @@ -58,68 +44,30 @@ def weaken(self) -> 'CoreIsolator': TODO: Changing step size, if needed """ # NOTE: Caller is assumed that BG workload - logger = logging.getLogger(__name__) - logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') - logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') - logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') - logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') if 
self._bg_next_step == NextStep.WEAKEN: - bg_cpuset = set(self._bg_cpuset) self._cur_bg_step -= 1 - bg_cpuset.add(self._cur_bg_step) - self._bg_cpuset = tuple(bg_cpuset) + if self._fg_next_step == NextStep.STRENGTHEN: - fg_cpuset = set(self._fg_cpuset) - fg_cpuset.remove(self._cur_fg_step) - self._fg_cpuset = tuple(fg_cpuset) self._cur_fg_step -= 1 + return self @property def is_max_level(self) -> bool: - logger = logging.getLogger(__name__) - logger.debug(f'bg max cpuset: {max(numa_topology.node_to_core[self._background_wl.cur_socket_id()])}') - logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') - logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') - logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') - logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') - - # FIXME: hard coded (Background can take lower cores) - # FIXME: How about first condition is true but the other is false? - if self._cur_bg_step == max(numa_topology.node_to_core[self._background_wl.cur_socket_id()]): - self._bg_next_step = NextStep.STOP - return True - # if self._cur_fg_step == self._cur_bg_step-1: - # self._fg_next_step = NextStep.STOP - else: - return False + # FIXME: hard coded (contiguous allocation) + return self._cur_bg_step == self._background_wl.orig_bound_cores[-1] and \ + self._cur_fg_step == self._cur_bg_step - 1 @property def is_min_level(self) -> bool: - logger = logging.getLogger(__name__) - logger.debug(f'self._cur_bg_step: {self._cur_bg_step}') - logger.debug(f'self._cur_fg_step: {self._cur_fg_step}') - logger.debug(f'self._bg_next_step: {self._bg_next_step.name}') - logger.debug(f'self._fg_next_step: {self._fg_next_step.name}') - - # FIXME: How about first condition is true but the other is false? 
- if self._cur_bg_step == self._cur_fg_step + 1: - return True - # if self._cur_fg_step == min(self._cpu_topo[self._foreground_wl.socket_id]): - # return True - else: - return False + return self._cur_bg_step == self._background_wl.orig_bound_cores[0] and \ + self._cur_fg_step == self._foreground_wl.orig_bound_cores[-1] def _enforce(self) -> None: - logger = logging.getLogger(__name__) - logger.debug(f'after enforcing : self._cur_bg_step is {self._cur_bg_step}') - logger.debug(f'after enforcing : self._cur_fg_step is {self._cur_fg_step}') - logger.debug(f'after enforcing : affinity of background is {hyphen.convert_to_hyphen(self._bg_cpuset)}') - logger.debug(f'after enforcing : affinity of foreground is {hyphen.convert_to_hyphen(self._fg_cpuset)}') - - self._bg_cgroup.assign_cpus(set(self._bg_cpuset)) - self._fg_cgroup.assign_cpus(set(self._fg_cpuset)) + # FIXME: hard coded (contiguous allocation) + self._foreground_wl.bound_cores = range(self._foreground_wl.orig_bound_cores[0], self._cur_fg_step) + self._background_wl.bound_cores = range(self._cur_bg_step, self._background_wl.orig_bound_cores[-1]) def _first_decision(self) -> NextStep: curr_diff = None @@ -134,36 +82,28 @@ def _first_decision(self) -> NextStep: logger.debug(f'current diff: {curr_diff:>7.4f}') # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) - fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.instruction_ps) - fg_weaken_cond = self.fg_weaken_cond(metric_diff.instruction_ps) if curr_diff < 0: if self.is_max_level: - self._bg_next_step = NextStep.STOP return NextStep.STOP else: - self._bg_next_step = NextStep.STRENGTHEN - if fg_weaken_cond: - self._fg_next_step = NextStep.WEAKEN - return NextStep.STRENGTHEN + return self._strengthen_condition(metric_diff.instruction_ps) + elif curr_diff <= CoreIsolator._FORCE_THRESHOLD: - self._bg_next_step = NextStep.STOP return NextStep.STOP + else: if self.is_min_level: - self._bg_next_step = NextStep.STOP return 
NextStep.STOP else: - self._bg_next_step = NextStep.WEAKEN - if fg_strengthen_cond: - self._fg_next_step = NextStep.STRENGTHEN - return NextStep.WEAKEN + return self._weaken_condition(metric_diff.instruction_ps) def _monitoring_result(self) -> NextStep: + logger = logging.getLogger(__name__) + logger.info(f'self._contentious_resource: {self._contentious_resource.name}') + metric_diff = self._foreground_wl.calc_metric_diff() curr_diff = None diff_of_diff = None - logger = logging.getLogger(__name__) - logger.info(f'self._contentious_resource: {self._contentious_resource.name}') if self._contentious_resource == ResourceType.MEMORY: curr_diff = metric_diff.local_mem_util_ps prev_diff = self._prev_metric_diff.local_mem_util_ps @@ -173,75 +113,61 @@ def _monitoring_result(self) -> NextStep: prev_diff = self._prev_metric_diff.instruction_ps diff_of_diff = curr_diff - prev_diff - logger = logging.getLogger(__name__) logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) - fg_strengthen_cond = self.fg_strengthen_cond(metric_diff.instruction_ps) - fg_weaken_cond = self.fg_weaken_cond(metric_diff.instruction_ps) - - logger = logging.getLogger(__name__) - logger.info(f'metric_diff.instruction_ps: {metric_diff.instruction_ps}') - logger.info(f'self.fg_strengthen_cond: {fg_strengthen_cond}') - logger.info(f'self.fg_weaken_cond: {fg_weaken_cond}') - # Case1 : diff is too small to perform isolation - if abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ + if self.is_max_level or self.is_min_level \ + or abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: - self._bg_next_step = NextStep.STOP - # self._fg_next_step = NextStep.STOP # This line depends on bg status return NextStep.STOP # Case2 : FG shows lower contention than solo-run -> Slower FG or Faster BG elif 
curr_diff > 0: + return self._weaken_condition(metric_diff.instruction_ps) + + # Case3 : FG shows higher contention than solo-run + else: + return self._strengthen_condition(metric_diff.instruction_ps) + + def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: + if self._cur_bg_step == self._background_wl.orig_bound_cores[0]: + self._bg_next_step = NextStep.IDLE + else: self._bg_next_step = NextStep.WEAKEN - if self.bg_outside_boundary(): - self._bg_next_step = NextStep.STOP - if fg_strengthen_cond is True: - self._fg_next_step = NextStep.STRENGTHEN - elif fg_strengthen_cond is False: - self._fg_next_step = NextStep.STOP + + # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) + # FIXME: hard coded (contiguous allocation) + if fg_instruction_ps > 0 and self._foreground_wl.orig_bound_cores[-1] < self._cur_fg_step: + self._fg_next_step = NextStep.STRENGTHEN + else: + self._fg_next_step = NextStep.IDLE + + if self._bg_next_step is NextStep.IDLE and self._fg_next_step is NextStep.IDLE: + return NextStep.STOP + else: return NextStep.WEAKEN - # Case3 : FG shows higher contention than solo-run + def _strengthen_condition(self, fg_instruction_ps: float) -> NextStep: + if self._cur_bg_step == self._background_wl.orig_bound_cores[-1]: + self._bg_next_step = NextStep.IDLE else: self._bg_next_step = NextStep.STRENGTHEN - if self.bg_outside_boundary(): - self._bg_next_step = NextStep.STOP - if fg_weaken_cond: - self._fg_next_step = NextStep.WEAKEN - elif fg_weaken_cond is False: - self._fg_next_step = NextStep.STOP - return NextStep.STRENGTHEN - def bg_outside_boundary(self) -> bool: - # FIXME: Assumption about fg's cpuset IDs are smaller than bg's ones. 
(kind of hard coded) - max_bg_cpuid = max(numa_topology.node_to_core[self._background_wl.cur_socket_id()]) - min_bg_cpuid = max(self._fg_cpuset) + 1 - if not (min_bg_cpuid < self._cur_bg_step < max_bg_cpuid): - return True + # FIXME: hard coded (contiguous allocation) + if fg_instruction_ps < 0 \ + and (self._bg_next_step is NextStep.STRENGTHEN or self._cur_bg_step - self._cur_fg_step > 1): + self._fg_next_step = NextStep.WEAKEN else: - return False + self._fg_next_step = NextStep.IDLE - def fg_strengthen_cond(self, fg_ipc_diff) -> bool: - min_skt_cpuid = min(numa_topology.node_to_core[self._foreground_wl.cur_socket_id()]) - if fg_ipc_diff > 0 and self._cur_fg_step > min_skt_cpuid: - return True - else: - return False - - def fg_weaken_cond(self, fg_ipc_diff) -> bool: - if fg_ipc_diff <= 0: - free_cpu = self._cur_bg_step - self._cur_fg_step - if (free_cpu > 0 and self._bg_next_step != NextStep.WEAKEN) \ - or (free_cpu == 0 and self._bg_next_step == NextStep.STOP): - return True + if self._bg_next_step is NextStep.IDLE and self._fg_next_step is NextStep.IDLE: + return NextStep.STOP else: - return False + return NextStep.STRENGTHEN def reset(self) -> None: if self._background_wl.is_running: - self._bg_cgroup.assign_cpus(self._prev_bg_affinity) + self._background_wl.bound_cores = self._background_wl.orig_bound_cores if self._foreground_wl.is_running: - self._fg_cgroup.assign_cpus(self._prev_fg_affinity) + self._foreground_wl.bound_cores = self._foreground_wl.orig_bound_cores diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 4568af3..afad3c2 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -4,7 +4,7 @@ from typing import Mapping, Type from .. 
import ResourceType -from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator from ...metric_container.basic_metric import BasicMetric, MetricDiff from ...workload import Workload @@ -33,7 +33,7 @@ def init_isolators(self) -> None: self._isolator_map = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)) + (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)) )) @property diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index b49ad49..9b4624c 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -4,7 +4,7 @@ from .base_policy import IsolationPolicy from .. 
import ResourceType -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload @@ -25,7 +25,8 @@ def choose_next_isolator(self) -> bool: resource: ResourceType = self.contentious_resource() if resource is ResourceType.CPU: - self._cur_isolator = self._isolator_map[SchedIsolator] + self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator._contentious_resource = ResourceType.CPU logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') return True @@ -42,7 +43,8 @@ def choose_next_isolator(self) -> bool: return True elif resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[SchedIsolator] + self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator._contentious_resource = ResourceType.MEMORY self._is_mem_isolated = False logger.info(f'Cpuset Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') return True diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 3101afa..de0124d 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -2,7 +2,7 @@ from collections import deque from itertools import chain -from typing import Deque, Iterable, Tuple +from typing import Deque, Iterable, Set, Tuple import psutil @@ -34,6 +34,9 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._cgroup_cpu = Cpu(self.group_name) self._resctrl = ResCtrl(self.group_name) + self._orig_bound_cores: Tuple[int, ...] 
= tuple(self._cgroup_cpuset.read_cpus()) + self._orig_bound_mems: Set[int] = self._cgroup_cpuset.read_mems() + def __repr__(self) -> str: return f'{self._name} (pid: {self._pid})' @@ -76,6 +79,14 @@ def bound_cores(self) -> Tuple[int, ...]: def bound_cores(self, core_ids: Iterable[int]): self._cgroup_cpuset.assign_cpus(core_ids) + @property + def orig_bound_cores(self) -> Tuple[int, ...]: + return self._orig_bound_cores + + @orig_bound_cores.setter + def orig_bound_cores(self, orig_bound_cores: Tuple[int, ...]) -> None: + self._orig_bound_cores = orig_bound_cores + @property def bound_mems(self) -> Tuple[int, ...]: return tuple(self._cgroup_cpuset.read_mems()) @@ -85,8 +96,12 @@ def bound_mems(self, affinity: Iterable[int]): self._cgroup_cpuset.assign_mems(affinity) @property - def perf_pid(self) -> int: - return self._perf_pid + def orig_bound_mems(self) -> Set[int]: + return self._orig_bound_mems + + @orig_bound_mems.setter + def orig_bound_mems(self, orig_bound_mems: Set[int]) -> None: + self._orig_bound_mems = orig_bound_mems @property def perf_interval(self): From 259aca094853dbf55a6f12cadf08bbfc16118d14 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 28 Sep 2018 22:31:30 +0900 Subject: [PATCH 24/82] improve isolator's conditions --- .../isolation/isolators/cache.py | 13 ++------ .../isolation/isolators/memory.py | 12 ++------ .../isolation/isolators/schedule.py | 30 +++++-------------- 3 files changed, 13 insertions(+), 42 deletions(-) diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 86923b3..6bc1efb 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -102,23 +102,16 @@ def _monitoring_result(self) -> NextStep: logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - if self._cur_step is not None \ - and not (ResCtrl.MIN_BITS < 
self._cur_step < ResCtrl.MAX_BITS) \ + if self.is_min_level or self.is_max_level \ or abs(diff_of_diff) <= CacheIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CacheIsolator._DOD_THRESHOLD: return NextStep.STOP elif curr_diff > 0: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN + return NextStep.WEAKEN else: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN + return NextStep.STRENGTHEN def reset(self) -> None: masks = [ResCtrl.MIN_MASK] * (max(numa_topology.cur_online_nodes()) + 1) diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 41b4d96..66f25e5 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -75,22 +75,16 @@ def _monitoring_result(self) -> NextStep: logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - if not (DVFS.MIN < self._cur_step < DVFS.MAX) \ + if self.is_min_level or self.is_max_level \ or abs(diff_of_diff) <= MemoryIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= MemoryIsolator._DOD_THRESHOLD: return NextStep.STOP elif curr_diff > 0: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.WEAKEN + return NextStep.WEAKEN else: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN + return NextStep.STRENGTHEN def reset(self) -> None: DVFS.set_freq(DVFS.MAX, self._orig_bg_affinity) diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 82840c3..34d08e5 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -15,13 +15,7 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) # FIXME: hard coded - if 
background_wl.cur_socket_id() is 1: - self._cur_step = 24 - else: - self._cur_step = 8 - - # FIXME: hard coded - self._prev_bg_affinity = range(8, 16) if background_wl.cur_socket_id() is 0 else range(24, 32) + self._cur_step = background_wl.orig_bound_cores[0] def strengthen(self) -> 'SchedIsolator': self._cur_step += 1 @@ -34,29 +28,20 @@ def weaken(self) -> 'SchedIsolator': @property def is_max_level(self) -> bool: # FIXME: hard coded - if self._background_wl.cur_socket_id() is 1: - return self._cur_step == 31 - else: - return self._cur_step == 15 + return self._cur_step == self._background_wl.orig_bound_cores[-1] @property def is_min_level(self) -> bool: # FIXME: hard coded - if self._background_wl.cur_socket_id() is 1: - return self._cur_step == 24 - else: - return self._cur_step == 8 + return self._cur_step == self._background_wl.orig_bound_cores[0] def _enforce(self) -> None: logger = logging.getLogger(__name__) # FIXME: hard coded - if self._background_wl.cur_socket_id() is 1: - logger.info(f'affinity of background is {self._cur_step}-31') - else: - logger.info(f'affinity of background is {self._cur_step}-15') + logger.info(f'affinity of background is {self._cur_step}-{self._background_wl.orig_bound_cores[-1]}') # FIXME: hard coded - self._background_wl.bound_cores = range(self._cur_step, 32 if self._background_wl.cur_socket_id() is 1 else 16) + self._background_wl.bound_cores = range(self._cur_step, self._background_wl.orig_bound_cores[-1] + 1) def _first_decision(self) -> NextStep: metric_diff = self._foreground_wl.calc_metric_diff() @@ -90,8 +75,7 @@ def _monitoring_result(self) -> NextStep: logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') # FIXME: hard coded - if (self._background_wl.cur_socket_id() is 1 and not (24 < self._cur_step < 31) or - self._background_wl.cur_socket_id() is 0 and not (8 < self._cur_step < 15)) \ + if self.is_min_level or self.is_max_level \ or abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ 
or abs(curr_diff) <= SchedIsolator._DOD_THRESHOLD: return NextStep.STOP @@ -104,4 +88,4 @@ def _monitoring_result(self) -> NextStep: def reset(self) -> None: if self._background_wl.is_running: - self._background_wl.bound_cores = self._prev_bg_affinity + self._background_wl.bound_cores = self._background_wl.orig_bound_cores From a095bd4504ad26c91cb00a901bbf3dcfe73de430 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 29 Sep 2018 14:28:39 +0900 Subject: [PATCH 25/82] added debugging log and fixes range error of CoreIsolator --- isolating_controller/isolation/isolators/core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index a7383ab..a7d94ff 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -65,9 +65,13 @@ def is_min_level(self) -> bool: self._cur_fg_step == self._foreground_wl.orig_bound_cores[-1] def _enforce(self) -> None: + logger = logging.getLogger(__name__) + logger.debug(f'fg affinity : {self._foreground_wl.orig_bound_cores[0]}-{self._cur_fg_step}') + logger.debug(f'bg affinity : {self._cur_bg_step}-{self._background_wl.orig_bound_cores[-1]}') + # FIXME: hard coded (contiguous allocation) - self._foreground_wl.bound_cores = range(self._foreground_wl.orig_bound_cores[0], self._cur_fg_step) - self._background_wl.bound_cores = range(self._cur_bg_step, self._background_wl.orig_bound_cores[-1]) + self._foreground_wl.bound_cores = range(self._foreground_wl.orig_bound_cores[0], self._cur_fg_step + 1) + self._background_wl.bound_cores = range(self._cur_bg_step, self._background_wl.orig_bound_cores[-1] + 1) def _first_decision(self) -> NextStep: curr_diff = None From 1e27587a7edf8c61576982aaa92fb7bf35ea0f51 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 30 Sep 2018 11:05:19 +0900 Subject: [PATCH 26/82] remove redundant logging statements --- 
.../isolation/policies/greedy_diff_policy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 9b4624c..609009a 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -32,14 +32,13 @@ def choose_next_isolator(self) -> bool: elif resource is ResourceType.CACHE: self._cur_isolator = self._isolator_map[CacheIsolator] - logger.info(f'Cache Isolation for {self._fg_wl} is started to isolate {ResourceType.CACHE.name}s') + logger.info(f'Cache Isolation for {self._fg_wl} is started') return True elif not self._is_mem_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[MemoryIsolator] self._is_mem_isolated = True - logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started ' - f'to isolate {ResourceType.MEMORY.name} BW') + logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') return True elif resource is ResourceType.MEMORY: From 259279c5e066441851ca1bbeac34774ff55f5c38 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 30 Sep 2018 14:49:01 +0900 Subject: [PATCH 27/82] finish swapper --- controller.py | 3 +- .../isolation/isolators/base_isolator.py | 7 ++ .../isolation/isolators/core.py | 4 +- .../isolation/isolators/memory.py | 4 +- .../isolation/policies/base_policy.py | 29 ++++- .../isolation/policies/greedy_diff_policy.py | 2 +- .../greedy_diff_with_violation_policy.py | 4 +- isolating_controller/utils/cgroup/cpuset.py | 8 +- isolating_controller/workload.py | 8 +- swap_iso.py | 103 ++++++++++-------- 10 files changed, 114 insertions(+), 58 deletions(-) diff --git a/controller.py b/controller.py index aa7f828..0c4c782 100755 --- a/controller.py +++ b/controller.py @@ -143,7 +143,7 @@ def __init__(self, pending_queue: PendingQueue) -> None: def 
_isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - # self._swapper.try_swap() + self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): logger.info('') @@ -208,6 +208,7 @@ def _remove_ended_groups(self) -> None: logger.info(f'{group} of {ended_workload.name} is ended') # remove from containers + group.reset() del self._isolation_groups[group] def run(self) -> None: diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index ecf45d6..c1cb5f5 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -91,3 +91,10 @@ def decide_next_step(self) -> NextStep: def reset(self) -> None: """Restore to initial configuration""" pass + + def change_fg_wl(self, new_workload: Workload) -> None: + self._foreground_wl = new_workload + self._prev_metric_diff = new_workload.calc_metric_diff() + + def change_bg_wl(self, new_workload: Workload) -> None: + self._background_wl = new_workload diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index a7d94ff..5bdc64d 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -142,7 +142,7 @@ def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) # FIXME: hard coded (contiguous allocation) - if fg_instruction_ps > 0 and self._foreground_wl.orig_bound_cores[-1] < self._cur_fg_step: + if fg_instruction_ps > -.5 and self._foreground_wl.orig_bound_cores[-1] < self._cur_fg_step: self._fg_next_step = NextStep.STRENGTHEN else: self._fg_next_step = NextStep.IDLE @@ -159,7 +159,7 @@ def _strengthen_condition(self, fg_instruction_ps: float) -> NextStep: self._bg_next_step = NextStep.STRENGTHEN # FIXME: hard coded (contiguous 
allocation) - if fg_instruction_ps < 0 \ + if fg_instruction_ps < -.5 \ and (self._bg_next_step is NextStep.STRENGTHEN or self._cur_bg_step - self._cur_fg_step > 1): self._fg_next_step = NextStep.WEAKEN else: diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 66f25e5..cb9e8bb 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -15,8 +15,6 @@ class MemoryIsolator(Isolator): def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) - self._orig_bg_affinity = background_wl.bound_cores - # FIXME: hard coded self._cur_step = DVFS.MAX @@ -87,4 +85,4 @@ def _monitoring_result(self) -> NextStep: return NextStep.STRENGTHEN def reset(self) -> None: - DVFS.set_freq(DVFS.MAX, self._orig_bg_affinity) + DVFS.set_freq(DVFS.MAX, self._background_wl.orig_bound_cores) diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index afad3c2..b65af06 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging from abc import ABCMeta, abstractmethod -from typing import Mapping, Type +from typing import Dict, Type from .. 
import ResourceType from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator @@ -18,17 +18,22 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._fg_wl = fg_wl self._bg_wl = bg_wl - self._isolator_map: Mapping[Type[Isolator], Isolator] = dict() + self._isolator_map: Dict[Type[Isolator], Isolator] = dict() self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR self._aggr_ipc_diff: float = None def __hash__(self) -> int: - return self._fg_wl.pid + return id(self) def __repr__(self) -> str: return f'{self.__class__.__name__} ' + def __del__(self) -> None: + isolators = tuple(self._isolator_map.keys()) + for isolator in isolators: + del self._isolator_map[isolator] + def init_isolators(self) -> None: self._isolator_map = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), @@ -78,10 +83,24 @@ def contentious_resource(self) -> ResourceType: def foreground_workload(self) -> Workload: return self._fg_wl + @foreground_workload.setter + def foreground_workload(self, new_workload: Workload): + self._fg_wl = new_workload + for isolator in self._isolator_map.values(): + isolator.change_fg_wl(new_workload) + isolator.enforce() + @property def background_workload(self) -> Workload: return self._bg_wl + @background_workload.setter + def background_workload(self, new_workload: Workload): + self._bg_wl = new_workload + for isolator in self._isolator_map.values(): + isolator.change_bg_wl(new_workload) + isolator.enforce() + @property def ended(self) -> bool: return not self._fg_wl.is_running or not self._bg_wl.is_running @@ -150,3 +169,7 @@ def update_aggr_instr(self) -> None: def set_idle_isolator(self) -> None: self._cur_isolator.yield_isolation() self._cur_isolator = IsolationPolicy._IDLE_ISOLATOR + + def reset(self) -> None: + for isolator in self._isolator_map.values(): + isolator.reset() diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py 
b/isolating_controller/isolation/policies/greedy_diff_policy.py index 609009a..4c53520 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -45,7 +45,7 @@ def choose_next_isolator(self) -> bool: self._cur_isolator = self._isolator_map[CoreIsolator] self._cur_isolator._contentious_resource = ResourceType.MEMORY self._is_mem_isolated = False - logger.info(f'Cpuset Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') + logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') return True else: diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 057dcfd..36bd186 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .greedy_diff_policy import GreedyDiffPolicy from .. 
import ResourceType -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload @@ -22,7 +22,7 @@ def _check_violation(self) -> bool: return \ resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, SchedIsolator)) + and not isinstance(self._cur_isolator, CoreIsolator)) @property def new_isolator_needed(self) -> bool: diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index 22ba9b0..086e2c0 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -24,8 +24,12 @@ def set_memory_migrate(self, flag: bool) -> None: def read_cpus(self) -> Set[int]: cpus = subprocess.check_output(args=('cgget', '-nvr', 'cpuset.cpus', self._group_name), encoding='ASCII') + if cpus is '': + raise ProcessLookupError() return convert_to_set(cpus) def read_mems(self) -> Set[int]: - cpus = subprocess.check_output(args=('cgget', '-nvr', 'cpuset.mems', self._group_name), encoding='ASCII') - return convert_to_set(cpus) + mems = subprocess.check_output(args=('cgget', '-nvr', 'cpuset.mems', self._group_name), encoding='ASCII') + if mems is '': + raise ProcessLookupError() + return convert_to_set(mems) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index de0124d..99eefe7 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -139,6 +139,12 @@ def cur_socket_id(self) -> int: # FIXME: hard coded if len(sockets) is not 1: - raise NotImplementedError('Workload spans multiple sockets.') + raise NotImplementedError(f'Workload spans multiple sockets. 
{sockets}') else: return next(iter(sockets)) + + def pause(self) -> None: + self._proc_info.suspend() + + def resume(self) -> None: + self._proc_info.resume() diff --git a/swap_iso.py b/swap_iso.py index 7634712..dbd9e78 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -1,10 +1,11 @@ # coding: UTF-8 import logging -import os -import signal +import subprocess from enum import IntEnum -from typing import Dict, Optional +from typing import Dict, Optional, Set + +import psutil from isolating_controller.isolation.policies.base_policy import IsolationPolicy from isolating_controller.workload import Workload @@ -19,10 +20,10 @@ class SwapIsolator: # FIXME: This threshold needs tests (How small diff is right for swapping workloads?) # "-0.5" means the IPCs of workloads in a group drop 50% compared to solo-run _IPC_DIFF_THRESHOLD = -0.5 + _VIOLATION_THRESHOLD = 5 def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: """ - :param isolation_groups: Dict. Key is the index of group and Value is the group itself """ self._all_groups: Dict[IsolationPolicy, int] = isolation_groups @@ -31,8 +32,8 @@ def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: self._most_cont_group: Optional[IsolationPolicy] = None self._least_cont_group: Optional[IsolationPolicy] = None - self._most_cont_workload: Optional[Workload] = None - self._least_cont_workload: Optional[Workload] = None + self._prev_wls: Set[Workload] = set() + self._violation_count: int = 0 def __del__(self): logger = logging.getLogger(__name__) @@ -62,57 +63,73 @@ def update_cont_group(self) -> None: self._most_cont_group = swap_out_grp self._least_cont_group = swap_in_grp - def choose_swap_candidates(self): - swap_out_grp = self._most_cont_group - swap_in_grp = self._least_cont_group - - # FIXME: This part depends on the swap policy (Which one is selected for swapping) - # TODO: Need Tests for Swap Overhead - swap_out_wl = swap_out_grp.least_mem_bw_workload - swap_in_wl = 
swap_in_grp.least_mem_bw_workload # It selects the bg workload in swap_in group - - self._swap_candidates[SwapNextStep.OUT] = swap_out_wl - self._swap_candidates[SwapNextStep.IN] = swap_in_wl - - def first_decision(self): - return - def swap_is_needed(self) -> bool: # FIXME: We used the average ipc diff value (We assume two workloads in a group at most) avg_min_ipc_diff = self._most_cont_group.aggr_ipc / 2 + print(avg_min_ipc_diff) # TODO: Test the _IPC_DIFF_THRESHOLD - if avg_min_ipc_diff < self._IPC_DIFF_THRESHOLD: - return True + if avg_min_ipc_diff > self._IPC_DIFF_THRESHOLD: + self._prev_wls.clear() + self._violation_count = 0 + return False + + if len(self._prev_wls) is 2 \ + and self._most_cont_group.background_workload in self._prev_wls \ + and self._least_cont_group.background_workload in self._prev_wls: + self._violation_count += 1 + print( + f'violation count of {self._most_cont_group.background_workload}, ' + f'{self._least_cont_group.background_workload} is {self._violation_count}') + return self._violation_count >= SwapIsolator._VIOLATION_THRESHOLD + else: + self._prev_wls.clear() + self._prev_wls.add(self._most_cont_group.background_workload) + self._prev_wls.add(self._least_cont_group.background_workload) + self._violation_count = 1 return False def do_swap(self) -> None: + self.pre_swap_setup() + # Enable CPUSET memory migration - out_wl = self._swap_candidates[SwapNextStep.OUT] - in_wl = self._swap_candidates[SwapNextStep.IN] + out_wl = self._most_cont_group.background_workload + in_wl = self._least_cont_group.background_workload + + print(f'swap {out_wl}, {in_wl}') - # Suspend Procs and Enforce Swap Conf. - os.kill(out_wl.pid, signal.SIGSTOP) - os.kill(in_wl.pid, signal.SIGSTOP) + try: + # Suspend Procs and Enforce Swap Conf. 
+ out_wl.pause() + in_wl.pause() - out_cpus = out_wl.bound_cores - out_mems = out_wl.mems - in_cpus = in_wl.bound_cores - in_mems = in_wl.mems + in_tmp, out_tmp = in_wl.orig_bound_mems, out_wl.orig_bound_mems + in_wl.orig_bound_mems, out_wl.orig_bound_mems = out_tmp, in_tmp + in_tmp, out_tmp = in_wl.orig_bound_cores, out_wl.orig_bound_cores + in_wl.orig_bound_cores, out_wl.orig_bound_cores = out_tmp, in_tmp - out_wl.bound_cores = in_cpus - out_wl.mems = in_mems - in_wl.bound_cores = out_cpus - in_wl.mems = out_mems + in_tmp, out_tmp = in_wl.bound_mems, out_wl.bound_mems + in_wl.bound_mems, out_wl.bound_mems = out_tmp, in_tmp + in_tmp, out_tmp = in_wl.bound_cores, out_wl.bound_cores + in_wl.bound_cores, out_wl.bound_cores = out_tmp, in_tmp - # Resume Procs - os.kill(out_wl.pid, signal.SIGCONT) - os.kill(in_wl.pid, signal.SIGCONT) + self._most_cont_group.background_workload = in_wl + self._least_cont_group.background_workload = out_wl + + except (psutil.NoSuchProcess, subprocess.CalledProcessError, ProcessLookupError) as e: + print(e) + + finally: + # Resume Procs + out_wl.resume() + in_wl.resume() + self._violation_count = 0 + self._prev_wls.clear() def pre_swap_setup(self) -> None: - swap_out_workload = self._swap_candidates[SwapNextStep.OUT] - swap_in_workload = self._swap_candidates[SwapNextStep.IN] + swap_out_workload = self._most_cont_group.background_workload + swap_in_workload = self._least_cont_group.background_workload swap_out_workload.cgroup_cpuset.set_memory_migrate(True) swap_in_workload.cgroup_cpuset.set_memory_migrate(True) @@ -122,6 +139,6 @@ def try_swap(self) -> None: return self.update_cont_group() - self.choose_swap_candidates() - if self.swap_is_needed: + + if self.swap_is_needed(): self.do_swap() From c9f67d43e1118f047257c2439e4e7dc7f1d5071f Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 30 Sep 2018 14:49:16 +0900 Subject: [PATCH 28/82] update and add solorun data --- .../solorun_data/facesim.json | 1 + 
.../solorun_data/particlefilter.json | 1 + solorun_data/8core/facesim.json | 15 +++++++++++++ solorun_data/8core/freqmine.json | 22 +++++++++---------- solorun_data/8core/particlefilter.json | 15 +++++++++++++ 5 files changed, 43 insertions(+), 11 deletions(-) create mode 120000 isolating_controller/solorun_data/facesim.json create mode 120000 isolating_controller/solorun_data/particlefilter.json create mode 100644 solorun_data/8core/facesim.json create mode 100644 solorun_data/8core/particlefilter.json diff --git a/isolating_controller/solorun_data/facesim.json b/isolating_controller/solorun_data/facesim.json new file mode 120000 index 0000000..0239831 --- /dev/null +++ b/isolating_controller/solorun_data/facesim.json @@ -0,0 +1 @@ +../../solorun_data/8core/facesim.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/particlefilter.json b/isolating_controller/solorun_data/particlefilter.json new file mode 120000 index 0000000..4c464ab --- /dev/null +++ b/isolating_controller/solorun_data/particlefilter.json @@ -0,0 +1 @@ +../../solorun_data/8core/particlefilter.json \ No newline at end of file diff --git a/solorun_data/8core/facesim.json b/solorun_data/8core/facesim.json new file mode 100644 index 0000000..4c39ab1 --- /dev/null +++ b/solorun_data/8core/facesim.json @@ -0,0 +1,15 @@ +{ + "name": "facesim", + "runtime": 74.43683433532715, + "l2miss": 195316036.70765027, + "l3miss": 29172485.150273222, + "instructions": 31213504312.2541, + "cycles": 16004271485.874317, + "stall_cycles": 3584421050.068306, + "wall_cycles": 2102741078.6202188, + "intra_coh": 3020455.9972677594, + "inter_coh": 10.0, + "llc_size": 41439544.54618474, + "local_mem": 1403864382.9508197, + "remote_mem": 9712220.327868853 +} \ No newline at end of file diff --git a/solorun_data/8core/freqmine.json b/solorun_data/8core/freqmine.json index 9780603..bb3d798 100644 --- a/solorun_data/8core/freqmine.json +++ b/solorun_data/8core/freqmine.json @@ -1,15 +1,15 @@ { "name": 
"freqmine", - "runtime": 69.44226408004761, - "l2miss": 68033447.65306123, - "l3miss": 10817418.483965015, - "instructions": 35975762909.854225, - "cycles": 19932432780.072887, - "stall_cycles": 4954574076.209912, - "wall_cycles": 2100459532.9446065, - "intra_coh": 17507614.15451895, - "inter_coh": 0.29154518950437314, + "runtime": 85.35708165168762, + "l2miss": 58436243.56506239, + "l3miss": 8935877.920380274, + "instructions": 29329754498.395725, + "cycles": 16095461475.448605, + "stall_cycles": 3889697179.524658, + "wall_cycles": 2106684766.0249555, + "intra_coh": 16015597.219251337, + "inter_coh": 0.7961972667855021, "llc_size": 385291493.877551, - "local_mem": 577093202.0991254, - "remote_mem": 3535695.860058309 + "local_mem": 665578631.681521, + "remote_mem": 6117472.133095663 } \ No newline at end of file diff --git a/solorun_data/8core/particlefilter.json b/solorun_data/8core/particlefilter.json new file mode 100644 index 0000000..c837ebd --- /dev/null +++ b/solorun_data/8core/particlefilter.json @@ -0,0 +1,15 @@ +{ + "name": "particlefilter", + "runtime": 78.08439254760742, + "l2miss": 1892421869.2578125, + "l3miss": 781396.8098958333, + "instructions": 29723062949.791664, + "cycles": 11968647735.755207, + "stall_cycles": 1535074183.3203125, + "wall_cycles": 2102764573.671875, + "intra_coh": 2694.4921875, + "inter_coh": 0.078125, + "llc_size": 41439544.54618474, + "local_mem": 53071360.0, + "remote_mem": 628906.6666666666 +} \ No newline at end of file From 5e9c2a0b69dfa90c634330311a934eb9fdfbd8fe Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Sun, 30 Sep 2018 23:44:45 +0900 Subject: [PATCH 29/82] fix: Fix some minor typos and add related functions --- .../isolation/isolators/core.py | 20 ++++++++++++++++ .../isolation/policies/base_policy.py | 23 ++++++++++--------- .../isolation/policies/greedy_diff_policy.py | 4 ++-- .../greedy_diff_with_violation_policy.py | 7 +++--- .../metric_container/basic_metric.py | 2 +- isolating_controller/workload.py | 10 
+++++--- 6 files changed, 46 insertions(+), 20 deletions(-) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 5bdc64d..0e152d3 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -175,3 +175,23 @@ def reset(self) -> None: self._background_wl.bound_cores = self._background_wl.orig_bound_cores if self._foreground_wl.is_running: self._foreground_wl.bound_cores = self._foreground_wl.orig_bound_cores + + @staticmethod + def _is_more_core_benefit(wl: Workload) -> bool: + wl_threads = wl.number_of_threads + wl_cpus= len(wl.cgroup_cpuset.read_cpus()) + print(f'{wl.wl_type}, {wl.name}, threads : {wl_threads}, len(cpuset): {wl_cpus}') + if wl_threads > wl_cpus: + return True + else: + return False + + @staticmethod + def _is_less_core_benefit(wl: Workload) -> bool: + wl_threads = wl.number_of_threads + wl_cpus= len(wl.cgroup_cpuset.read_cpus()) + print(f'{wl.wl_type}, {wl.name}, threads : {wl_threads}, len(cpuset): {wl_cpus}') + if wl_threads < wl_cpus: + return True + else: + return False diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index b65af06..e0f2cfc 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -4,7 +4,7 @@ from typing import Dict, Type from .. 
import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator from ...metric_container.basic_metric import BasicMetric, MetricDiff from ...workload import Workload @@ -21,7 +21,7 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._isolator_map: Dict[Type[Isolator], Isolator] = dict() self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR - self._aggr_ipc_diff: float = None + self._aggr_inst_diff: float = None def __hash__(self) -> int: return id(self) @@ -38,7 +38,8 @@ def init_isolators(self) -> None: self._isolator_map = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)) + (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)), + (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)) )) @property @@ -114,19 +115,19 @@ def name(self) -> str: return f'{self._fg_wl.name}({self._fg_wl.pid})' @property - def aggr_ipc(self) -> float: - return self._aggr_ipc_diff + def aggr_inst(self) -> float: + return self._aggr_inst_diff @property def most_cont_workload(self) -> Workload: fg_wl = self.foreground_workload bg_wl = self.background_workload - fg_ipc_diff = fg_wl.ipc_diff - bg_ipc_diff = bg_wl.ipc_diff + fg_inst_diff = fg_wl.inst_diff + bg_inst_diff = bg_wl.inst_diff # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_ipc_diff < bg_ipc_diff: + if fg_inst_diff < bg_inst_diff: return fg_wl else: return bg_wl @@ -136,8 +137,8 @@ def least_cont_workload(self) -> Workload: fg_wl = self.foreground_workload bg_wl = self.background_workload - fg_ipc_diff = fg_wl.ipc_diff - bg_ipc_diff = bg_wl.ipc_diff + fg_ipc_diff = fg_wl.inst_diff + bg_ipc_diff = bg_wl.inst_diff # FIXME: Below condition is likely to fail due to too 
little differences between fg and bg if fg_ipc_diff > bg_ipc_diff: @@ -164,7 +165,7 @@ def update_aggr_instr(self) -> None: bg_diff = self._bg_wl.calc_metric_diff() self._fg_wl._ipc_diff = fg_diff.instruction_ps self._bg_wl._ipc_diff = bg_diff.instruction_ps - self._aggr_ipc_diff = fg_diff.instruction_ps + bg_diff.instruction_ps + self._aggr_inst_diff = fg_diff.instruction_ps + bg_diff.instruction_ps def set_idle_isolator(self) -> None: self._cur_isolator.yield_isolation() diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 4c53520..43dbf2a 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -4,7 +4,7 @@ from .base_policy import IsolationPolicy from .. import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -42,7 +42,7 @@ def choose_next_isolator(self) -> bool: return True elif resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[CoreIsolator] + self._cur_isolator = self._isolator_map[SchedIsolator] self._cur_isolator._contentious_resource = ResourceType.MEMORY self._is_mem_isolated = False logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 36bd186..2912119 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .greedy_diff_policy import GreedyDiffPolicy from .. 
import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -20,9 +20,10 @@ def _check_violation(self) -> bool: resource: ResourceType = self.contentious_resource() return \ - resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ + resource is ResourceType.CPU and not isinstance(self._cur_isolator, CoreIsolator) \ + or resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, CoreIsolator)) + and not isinstance(self._cur_isolator, SchedIsolator)) @property def new_isolator_needed(self) -> bool: diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index bcedcb9..3754628 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -146,4 +146,4 @@ def instruction_ps(self) -> float: def __repr__(self) -> str: return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, ' \ f'Local Memory access diff: {self._local_mem_ps:>6.03f}, ' \ - f'instruction diff: {self._instruction_ps:>6.03f}' + f'Instructions per sec. 
diff: {self._instruction_ps:>6.03f}' diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 99eefe7..84c500e 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -28,7 +28,7 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._perf_interval = perf_interval self._proc_info = psutil.Process(pid) - self._ipc_diff: float = None + self._inst_diff: float = None self._cgroup_cpuset = CpuSet(self.group_name) self._cgroup_cpu = Cpu(self.group_name) @@ -112,13 +112,17 @@ def is_running(self) -> bool: return self._proc_info.is_running() @property - def ipc_diff(self) -> float: - return self._ipc_diff + def inst_diff(self) -> float: + return self._inst_diff @property def group_name(self) -> str: return f'{self.name}_{self.pid}' + @property + def number_of_threads(self) -> int: + return self._proc_info.num_threads() + def calc_metric_diff(self) -> MetricDiff: solorun_data = data_map[self.name] curr_metric: BasicMetric = self._metrics[0] From 99e08f4d9e7463079b5ef36646dffbb0c1e8b85e Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 2 Oct 2018 14:17:34 +0900 Subject: [PATCH 30/82] feat: Add proc_arbitrator.py --- proc_arbitrator.py | 185 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 proc_arbitrator.py diff --git a/proc_arbitrator.py b/proc_arbitrator.py new file mode 100644 index 0000000..cc1a4b5 --- /dev/null +++ b/proc_arbitrator.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python +# coding=UTF-8 + +from __future__ import print_function, division + +import multiprocessing +import os +import sys +import time +from signal import SIGSTOP, SIGCONT +from threading import Timer + +import psutil + +dead_status = (psutil.STATUS_DEAD, psutil.STATUS_ZOMBIE) + + +class ProcessArbitrator: + def __init__(self, pids, time_slice=50, iteration_limit=None): + """ + 생성자 + + Args: + pids (list of tuple): (실행할 process의 pid, perf의 pid)들의 list + 
time_slice (int): 한 process가 실행될 interval (ms 단위). 기본값은 500 + iteration_limit (int): 각 pid들의 반복횟수를 제한. `None` 으로하면 무제한. 기본값은 `None` + + Raises: + ValueError: pid의 타입이 이상할 경우 + + Notes + `time_slice` 가 0일경우 제대로 동작안함 (중요하지 않아보여서 처리하지 않음) + """ + if not pids: + raise ValueError('`pids` cannot be `None`') + + self._iteration_limit = iteration_limit + self._time_slice = time_slice + self._all_pids = list(pids) + self._remain_pids = list(pids) + self.next_proc() + + def next_proc(self): + # self._print_status() + + self._stop_all() + # print 'all process stopped' + + next_pid = self.pick_next_proc() + + if not next_pid: + # print 'no more process to run' + return + + # print 'next process is : ' + str(next_pid) + if next_pid[0] is not None: + os.kill(next_pid[0], SIGCONT) + + if next_pid[1] is not None: + os.kill(next_pid[1], SIGCONT) + + Timer(self._time_slice / 1000, self.next_proc).start() + + def pick_next_proc(self): + """ + `ProcessArbitrator` 에 포함된 process중에서 다음 time slice때 실행될 process의 pid를 구한다. + 더이상 실행할 process가 없을 때, 혹은 `iteration_limit` 에 도달했을때 `None` 을 반환한다. 
+ + Returns: + tuple of int: 다음 time slice에 실행할 process의 pid + """ + while True: + if len(self._remain_pids) is 0: + if len(self._all_pids) is 0: + return None + + elif self._iteration_limit is 1: + self._resume_all() + return None + + else: + self._remain_pids.extend(self._all_pids) + if self._iteration_limit: + self._iteration_limit -= 1 + + next_pid = self._remain_pids.pop() + + is_ps1_dead = False + is_ps2_dead = False + + try: + if psutil.Process(next_pid[0]).status() in dead_status: + is_ps1_dead = True + except psutil.NoSuchProcess: + is_ps1_dead = True + + try: + if psutil.Process(next_pid[1]).status() in dead_status: + is_ps2_dead = True + except psutil.NoSuchProcess: + is_ps2_dead = True + + if is_ps1_dead and not is_ps2_dead: + return None, next_pid[1] + + elif not is_ps1_dead and is_ps2_dead: + return next_pid[0], None + + elif not is_ps1_dead and not is_ps2_dead: + return next_pid + + else: + self._all_pids.remove(next_pid) + + def set_time_slice(self, time_slice): + self._time_slice = time_slice + + def _stop_all(self): + try: + for pid in self._all_pids: + os.kill(pid[0], SIGSTOP) + os.kill(pid[1], SIGSTOP) + except: + pass + + def _resume_all(self): + try: + for pid in self._all_pids: + os.kill(pid[0], SIGCONT) + os.kill(pid[1], SIGCONT) + except: + pass + + def _print_status(self): + for pid in self._all_pids: + try: + process = psutil.Process(pid[0]) + sys.stdout.write(str(process.pid) + ':' + process.status() + ', ') + except psutil.NoSuchProcess: + pass + print() + + +def main(): + num = 4 + from datetime import datetime + + def test_thread(name): + for i in range(num): + time.sleep(1) + sys.stderr.write('{}\t{}({})\t{}\n'.format(datetime.now(), name, os.getpid(), i)) + + return + + processes = [] + + try: + proc_num = 2 + + pids = [] + + for n in range(proc_num): + process = multiprocessing.Process(target=test_thread, args=('process #' + str(n),)) + process.start() + process2 = multiprocessing.Process(target=test_thread, args=('process #' + 
str(n) + '\'s sidekick',)) + process2.start() + pids.append((process.pid, process2.pid)) + + processes.append(process) + processes.append(process2) + + ProcessArbitrator(pids, 50) + + for process in processes: + print('start to join {0}'.format(process.pid)) + process.join() + print('end of {0}'.format(process.pid)) + + except KeyboardInterrupt: + for process in processes: + process.terminate() + + +if __name__ == '__main__': + main() From 6b44b2a42d9c601b370d7a735cee1d4cf16f225b Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Thu, 4 Oct 2018 22:01:19 +0900 Subject: [PATCH 31/82] feat: Add online profiling code --- controller.py | 92 ++++++++++++++++++- .../isolation/isolators/core.py | 51 +++++++--- .../isolation/policies/base_policy.py | 5 +- .../isolation/policies/greedy_diff_policy.py | 14 ++- .../greedy_diff_with_violation_policy.py | 2 +- .../metric_container/basic_metric.py | 40 +++++++- isolating_controller/workload.py | 63 ++++++++++++- 7 files changed, 238 insertions(+), 29 deletions(-) diff --git a/controller.py b/controller.py index 0c4c782..9a71368 100755 --- a/controller.py +++ b/controller.py @@ -25,6 +25,7 @@ from isolating_controller.workload import Workload from pending_queue import PendingQueue from swap_iso import SwapIsolator +from proc_arbitrator import ProcessArbitrator MIN_PYTHON = (3, 6) @@ -102,7 +103,17 @@ def _cbk_wl_monitor(self, workload: Workload, logger = logging.getLogger(f'monitoring.metric.{workload}') logger.debug(f'{metric} is given from ') - metric_que = workload.metrics + metric_que = None + if not workload.profile_solorun: + logger.debug(f'Metric_queue : workload.metrics') + # TODO: Do we need clear()? 
+ metric_que = workload.metrics + elif workload.profile_solorun: + logger.debug(f'Metric_queue : workload.profile_solorun') + # init the solorun_data_queue + workload.solorun_data_queue.clear() + # suspend ALL BGs in the same socket + metric_que = workload.solorun_data_queue if len(metric_que) == self._metric_buf_size: metric_que.pop() @@ -135,7 +146,9 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._pending_queue: PendingQueue = pending_queue - self._interval: float = 0.2 # Scheduling interval + self._interval: float = 0.2 # scheduling interval (sec) + self._profile_interval: float = 1.0 # check interval for phase change (sec) + self._solorun_interval: float = 2.0 # the FG's solorun profiling interval (sec) self._isolation_groups: Dict[IsolationPolicy, int] = dict() # Swapper init self._swapper: SwapIsolator = SwapIsolator(self._isolation_groups) @@ -211,16 +224,84 @@ def _remove_ended_groups(self) -> None: group.reset() del self._isolation_groups[group] + def _profile_solorun(self) -> None: + """ + profile solorun status of a workload + :return: + """ + all_fg_wls = list() + all_bg_wls = list() + # suspend all workloads and their perf agents + for group in self._isolation_groups: + fg_wl = group.foreground_workload + bg_wl = group.background_workload + fg_wl.pause() + fg_wl.pause_perf() + bg_wl.pause() + bg_wl.pause_perf() + all_fg_wls.append(fg_wl) + all_bg_wls.append(bg_wl) + + # run FG workloads alone + for fg_wl in all_fg_wls: + fg_wl.profile_solorun = True + fg_wl.resume() + fg_wl.resume_perf() + + # four seconds for monitoring solo-run + time.sleep(self._solorun_interval) + + # disable solorun mode + for fg_wl in all_fg_wls: + fg_wl.profile_solorun = False + + # resume BG workloads + for bg_wl in all_bg_wls: + bg_wl.resume() + bg_wl.resume_perf() + + def _update_all_workloads_num_threads(self): + """ + update the workloads' number of threads (cur_num_threads -> prev_num_threads) + :return: + """ + for group in self._isolation_groups: + 
bg_wl = group.background_workload + fg_wl = group.foreground_workload + bg_wl.update_num_threads() + fg_wl.update_num_threads() + + def _profile_needed(self, count: int) -> bool: + """ + This function checks if the profiling procedure should be called + + profile_freq : the frequencies of online profiling + :param count: This counts the number of entering the run func. loop + :return: Decision whether to initiate online solorun profiling + """ + + profile_freq = int(self._profile_interval/self._interval) + for group in self._isolation_groups: + fg_wl = group.foreground_workload + if count % profile_freq != 0 and fg_wl.is_num_threads_changed(): + self._update_all_workloads_num_threads() + return False + else: + self._update_all_workloads_num_threads() + return True + def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') - + count = 0 while True: self._remove_ended_groups() self._register_pending_workloads() time.sleep(self._interval) - + count += 1 + if self._profile_needed(count): + self._profile_solorun() self._isolate_workloads() @@ -231,7 +312,8 @@ def main() -> None: args = parser.parse_args() - stream_handler = logging.StreamHandler() + #stream_handler = logging.StreamHandler() + stream_handler = logging.FileHandler('debug.log') stream_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s')) controller_logger = logging.getLogger(__name__) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 0e152d3..919f855 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -10,6 +10,7 @@ class CoreIsolator(Isolator): _DOD_THRESHOLD = 0.005 _FORCE_THRESHOLD = 0.1 + _INST_PS_THRESHOLD = -0.5 def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) @@ -135,14 +136,25 @@ def _monitoring_result(self) -> 
NextStep: return self._strengthen_condition(metric_diff.instruction_ps) def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: - if self._cur_bg_step == self._background_wl.orig_bound_cores[0]: - self._bg_next_step = NextStep.IDLE - else: - self._bg_next_step = NextStep.WEAKEN + fg_not_used_cores = len(self._foreground_wl.bound_cores) - self._foreground_wl.number_of_threads + # BG Next Step Decision + # ResourceType.CPU - If FG workload not fully use all its assigned cores..., then BG can weaken! + if self._contentious_resource == ResourceType.CPU: + if fg_not_used_cores == 0: + self._bg_next_step = NextStep.IDLE + elif fg_not_used_cores > 0: + self._bg_next_step = NextStep.WEAKEN + # ResourceType.MEMORY - If BG workload was strengthened than its assigned cores, then BG can weaken! + elif self._contentious_resource == ResourceType.MEMORY: + if self._cur_bg_step == self._background_wl.orig_bound_cores[0]: + self._bg_next_step = NextStep.IDLE + else: + self._bg_next_step = NextStep.WEAKEN # FIXME: Specifying fg's strengthen/weaken condition (related to fg's performance) # FIXME: hard coded (contiguous allocation) - if fg_instruction_ps > -.5 and self._foreground_wl.orig_bound_cores[-1] < self._cur_fg_step: + # FG Next Step Decision + if fg_instruction_ps > self._INST_PS_THRESHOLD and self._foreground_wl.orig_bound_cores[-1] < self._cur_fg_step: self._fg_next_step = NextStep.STRENGTHEN else: self._fg_next_step = NextStep.IDLE @@ -153,14 +165,31 @@ def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: return NextStep.WEAKEN def _strengthen_condition(self, fg_instruction_ps: float) -> NextStep: - if self._cur_bg_step == self._background_wl.orig_bound_cores[-1]: - self._bg_next_step = NextStep.IDLE - else: - self._bg_next_step = NextStep.STRENGTHEN + logger = logging.getLogger(__name__) + + # BG Next Step Decision + # ResourceType.CPU - If FG workload shows low performance and FG's threads are larger than its assigned cores, + # then BG can 
strengthen! + if self._contentious_resource == ResourceType.CPU: + if fg_instruction_ps > self._INST_PS_THRESHOLD: + self._bg_next_step = NextStep.IDLE + elif fg_instruction_ps <= self._INST_PS_THRESHOLD and \ + self._foreground_wl.number_of_threads > len(self._foreground_wl.bound_cores): + self._bg_next_step = NextStep.STRENGTHEN + # ResourceType.MEMORY - If BG workload can strengthen its cores... , then strengthen BG's cores! + elif self._contentious_resource == ResourceType.MEMORY: + if self._cur_bg_step == self._background_wl.orig_bound_cores[-1]: + self._bg_next_step = NextStep.IDLE + else: + self._bg_next_step = NextStep.STRENGTHEN # FIXME: hard coded (contiguous allocation) - if fg_instruction_ps < -.5 \ - and (self._bg_next_step is NextStep.STRENGTHEN or self._cur_bg_step - self._cur_fg_step > 1): + # FG Next Step Decision + logger.debug(f'FG threads: {self._foreground_wl.number_of_threads}, ' + f'orig_bound_cores: {self._foreground_wl.orig_bound_cores}') + if fg_instruction_ps < self._INST_PS_THRESHOLD \ + and (self._bg_next_step is NextStep.STRENGTHEN or self._cur_bg_step - self._cur_fg_step > 1) \ + and self._foreground_wl.number_of_threads > len(self._foreground_wl.orig_bound_cores): self._fg_next_step = NextStep.WEAKEN else: self._fg_next_step = NextStep.IDLE diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index e0f2cfc..de2c6c9 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -4,7 +4,7 @@ from typing import Dict, Type from .. 
import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator from ...metric_container.basic_metric import BasicMetric, MetricDiff from ...workload import Workload @@ -39,7 +39,6 @@ def init_isolators(self) -> None: (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)), - (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)) )) @property @@ -57,7 +56,7 @@ def contentious_resource(self) -> ResourceType: logger = logging.getLogger(__name__) logger.info(repr(metric_diff)) - logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}') + logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}, llc_util: {cur_metric.l3_util}') if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: return ResourceType.CPU diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 43dbf2a..096b822 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -27,25 +27,29 @@ def choose_next_isolator(self) -> bool: if resource is ResourceType.CPU: self._cur_isolator = self._isolator_map[CoreIsolator] self._cur_isolator._contentious_resource = ResourceType.CPU - logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') + #logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') + logger.info(f'Resource Type: {ResourceType.CPU.name}, CoreIsolation') return True elif resource is ResourceType.CACHE: self._cur_isolator = 
self._isolator_map[CacheIsolator] - logger.info(f'Cache Isolation for {self._fg_wl} is started') + #logger.info(f'Cache Isolation for {self._fg_wl} is started') + logger.info(f'Resource Type: {ResourceType.CACHE.name}, CacheIsolation') return True elif not self._is_mem_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[MemoryIsolator] self._is_mem_isolated = True - logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') + #logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') + logger.info(f'Resource Type: {ResourceType.MEMORY.name}, MemoryIsolation') return True elif resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[SchedIsolator] + self._cur_isolator = self._isolator_map[CoreIsolator] self._cur_isolator._contentious_resource = ResourceType.MEMORY self._is_mem_isolated = False - logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') + #logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') + logger.info(f'Resource Type: {ResourceType.MEMORY.name}, CoreIsolation') return True else: diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 2912119..fd240b5 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -23,7 +23,7 @@ def _check_violation(self) -> bool: resource is ResourceType.CPU and not isinstance(self._cur_isolator, CoreIsolator) \ or resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, SchedIsolator)) + and not isinstance(self._cur_isolator, CoreIsolator)) @property def 
new_isolator_needed(self) -> bool: diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 3754628..8e2d28f 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -1,15 +1,18 @@ # coding: UTF-8 +from __future__ import division + from time import localtime, strftime from cpuinfo import cpuinfo + LLC_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 class BasicMetric: - def __init__(self, l2miss, l3miss, inst, cycles, stall_cycles, wall_cycles, intra_coh, inter_coh, llc_size, - local_mem, remote_mem, interval: int): + def __init__(self, l2miss=0, l3miss=0, inst=0, cycles=0, stall_cycles=0, wall_cycles=0, intra_coh=0, + inter_coh=0, llc_size=0, local_mem=0, remote_mem=0, interval: int=1000): self._l2miss = l2miss self._l3miss = l3miss self._instructions = inst @@ -40,6 +43,10 @@ def instruction(self): def instruction_ps(self): return self._instructions * (1000 / self._interval) + @property + def wall_cycles(self): + return self._wall_cycles + @property def cycles(self): return self._cycles @@ -114,11 +121,40 @@ def l3_intensity(self) -> float: def mem_intensity(self) -> float: return self.llc_util * self.l3miss_ratio + @property + def l3_util(self) -> float: + return self.llc_util + def __repr__(self) -> str: return ', '.join(map(str, ( self._l2miss, self._l3miss, self._instructions, self._cycles, self._stall_cycles, self._intra_coh, self._inter_coh, self._llc_size, self._req_date))) + def __add__(self, others): + self._l2miss = self.l2miss + others.l2miss + self._l3miss = self.l3miss + others.l3miss + self._instructions = self.instruction + others.instruction + self._wall_cycles = self.wall_cycles + others.wall_cycles + self._stall_cycles = self.stall_cycle + others.stall_cycle + self._intra_coh = self.intra_coh + others.intra_coh + self._inter_coh = self.inter_coh + others.inter_coh + self._llc_size = 
self.llc_size + others.llc_size + self._local_mem = self.local_mem + others.local_mem + self._remote_mem = self.remote_mem + others.remote_mem + + def __truediv__(self, other): + self._l2miss /= other + self._l3miss /= other + self._instructions /= other + self._wall_cycles /= other + self._cycles /= other + self._stall_cycles /= other + self._intra_coh /= other + self._inter_coh /= other + self._llc_size /= other + self._local_mem /= other + self._remote_mem /= other + class MetricDiff: def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 84c500e..da03f6f 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -5,6 +5,7 @@ from typing import Deque, Iterable, Set, Tuple import psutil +import logging from .metric_container.basic_metric import BasicMetric, MetricDiff from .solorun_data.datas import data_map @@ -28,12 +29,18 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._perf_interval = perf_interval self._proc_info = psutil.Process(pid) + self._perf_info = psutil.Process(perf_pid) self._inst_diff: float = None self._cgroup_cpuset = CpuSet(self.group_name) self._cgroup_cpu = Cpu(self.group_name) self._resctrl = ResCtrl(self.group_name) + self._profile_solorun: bool = False + self._solorun_data_queue: Deque[BasicMetric] = deque() # This queue is used to collect and calculate avg. status + self._avg_solorun_data: BasicMetric = None # This variable is used to contain the recent avg. status + self._prev_num_threads: int = None + self._orig_bound_cores: Tuple[int, ...] 
= tuple(self._cgroup_cpuset.read_cpus()) self._orig_bound_mems: Set[int] = self._cgroup_cpuset.read_mems() @@ -123,10 +130,45 @@ def group_name(self) -> str: def number_of_threads(self) -> int: return self._proc_info.num_threads() + @property + def prev_num_threads(self) -> int: + return self._prev_num_threads + + def update_num_threads(self) -> None: + self._prev_num_threads = self._proc_info.num_threads() + + @property + def profile_solorun(self) -> bool: + return self._profile_solorun + + @profile_solorun.setter + def profile_solorun(self, new_flag: bool) -> None: + self._profile_solorun = new_flag + + @property + def solorun_data_queue(self) -> Deque[BasicMetric]: + return self._solorun_data_queue + + @property + def avg_solorun_data(self) -> BasicMetric: + return self._avg_solorun_data + + def calc_avg_solorun(self) -> None: + counts = 0 + sum_of_items = BasicMetric() + for item in self.solorun_data_queue: + sum_of_items += item + counts += 1 + self._avg_solorun_data = sum_of_items / counts + def calc_metric_diff(self) -> MetricDiff: - solorun_data = data_map[self.name] + logger=logging.getLogger(__name__) + #solorun_data = data_map[self.name] + if self._avg_solorun_data is not None: + solorun_data = self._avg_solorun_data + else: + solorun_data = data_map[self.name] curr_metric: BasicMetric = self._metrics[0] - return MetricDiff(curr_metric, solorun_data) def all_child_tid(self) -> Tuple[int, ...]: @@ -152,3 +194,20 @@ def pause(self) -> None: def resume(self) -> None: self._proc_info.resume() + + def pause_perf(self) -> None: + self._perf_info.suspend() + + def resume_perf(self) -> None: + self._perf_info.resume() + + def is_num_threads_changed(self) -> bool: + """ + Detecting the phase changes based on the changes in the number of threads + :return: + """ + cur_num_threads = self.number_of_threads + if self._prev_num_threads == cur_num_threads: + return False + else: + return True From 4a00cd8c2ed9a967a4e6f64fe8824991fe74b587 Mon Sep 17 00:00:00 2001 From: 
Yoonsung Nam Date: Mon, 8 Oct 2018 01:06:05 +0900 Subject: [PATCH 32/82] feat: Add storing and loading each isolation configs for profiling solorun --- controller.py | 26 ++-- .../isolation/isolators/base_isolator.py | 10 ++ .../isolation/isolators/cache.py | 14 ++- .../isolation/isolators/core.py | 17 ++- .../isolation/isolators/idle.py | 3 + .../isolation/isolators/memory.py | 12 ++ .../isolation/policies/base_policy.py | 118 +++++++++++++++++- isolating_controller/utils/cgroup/cpuset.py | 13 ++ isolating_controller/utils/dvfs.py | 40 +++++- isolating_controller/utils/resctrl.py | 37 ++++++ isolating_controller/workload.py | 7 +- 11 files changed, 282 insertions(+), 15 deletions(-) diff --git a/controller.py b/controller.py index 9a71368..919008e 100755 --- a/controller.py +++ b/controller.py @@ -25,7 +25,6 @@ from isolating_controller.workload import Workload from pending_queue import PendingQueue from swap_iso import SwapIsolator -from proc_arbitrator import ProcessArbitrator MIN_PYTHON = (3, 6) @@ -147,8 +146,9 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._pending_queue: PendingQueue = pending_queue self._interval: float = 0.2 # scheduling interval (sec) - self._profile_interval: float = 1.0 # check interval for phase change (sec) - self._solorun_interval: float = 2.0 # the FG's solorun profiling interval (sec) + self._count: int = 0 # scheduling counts + self._profile_interval: float = 1.0 # check interval for phase change (sec) + self._solorun_interval: float = 2.0 # the FG's solorun profiling interval (sec) self._isolation_groups: Dict[IsolationPolicy, int] = dict() # Swapper init self._swapper: SwapIsolator = SwapIsolator(self._isolation_groups) @@ -159,6 +159,16 @@ def _isolate_workloads(self) -> None: self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): + if group.profile_needed(self._profile_interval, self._interval, self._count): + group.store_cur_configs() + group.profile_solorun() + 
group.profile_stop_cond = self._count + int(self._solorun_interval/self._interval) + elif group.foreground_workload.profile_solorun is True and self._count > group.profile_stop_cond: + group.all_workload_pause() + group.foreground_workload.profile_solorun = False + group.reset_stored_configs() + group.all_workload_resume() + logger.info('') logger.info(f'***************isolation of {group.name} #{iteration_num}***************') @@ -293,15 +303,15 @@ def _profile_needed(self, count: int) -> bool: def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') - count = 0 + # count = 0 while True: self._remove_ended_groups() self._register_pending_workloads() time.sleep(self._interval) - count += 1 - if self._profile_needed(count): - self._profile_solorun() + # count += 1 + # if self._profile_needed(count): + # self._profile_solorun() self._isolate_workloads() @@ -312,7 +322,7 @@ def main() -> None: args = parser.parse_args() - #stream_handler = logging.StreamHandler() + # stream_handler = logging.StreamHandler() stream_handler = logging.FileHandler('debug.log') stream_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s')) diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index c1cb5f5..d83da2f 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -98,3 +98,13 @@ def change_fg_wl(self, new_workload: Workload) -> None: def change_bg_wl(self, new_workload: Workload) -> None: self._background_wl = new_workload + + @abstractmethod + def store_cur_config(self) -> None: + """Store the current configuration""" + pass + + @abstractmethod + def load_cur_config(self) -> None: + """Load the current configuration""" + pass diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 
6bc1efb..33f8b80 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging -from typing import Optional +from typing import Optional, Tuple from .base_isolator import Isolator from .. import NextStep @@ -19,6 +19,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._prev_step: Optional[int] = None self._cur_step: Optional[int] = None + self._stored_config: Tuple[str, ...] = None + def strengthen(self) -> 'CacheIsolator': self._prev_step = self._cur_step @@ -124,3 +126,13 @@ def reset(self) -> None: if self._foreground_wl.is_running: masks[self._foreground_wl.cur_socket_id()] = ResCtrl.MAX_MASK self._foreground_wl.resctrl.assign_llc(*masks) + + def store_cur_config(self) -> None: + fg_resctrl = self._foreground_wl.resctrl + fg_mask = fg_resctrl.get_llc_mask() + bg_resctrl = self._background_wl.resctrl + bg_mask = bg_resctrl.get_llc_mask() + self._stored_config = (fg_mask, bg_mask) + + def load_cur_config(self): + return self._stored_config diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 919f855..0214bf8 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -2,6 +2,7 @@ import logging +from typing import Tuple, Set from .base_isolator import Isolator from .. import NextStep, ResourceType from ...workload import Workload @@ -24,6 +25,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._contentious_resource: ResourceType = ResourceType.MEMORY + self._stored_config: Tuple[Set[int], ...] 
= None + def strengthen(self) -> 'CoreIsolator': """ Strengthen reduces the number of CPUs assigned to BG workloads and increase that of FG workload @@ -208,7 +211,7 @@ def reset(self) -> None: @staticmethod def _is_more_core_benefit(wl: Workload) -> bool: wl_threads = wl.number_of_threads - wl_cpus= len(wl.cgroup_cpuset.read_cpus()) + wl_cpus = len(wl.cgroup_cpuset.read_cpus()) print(f'{wl.wl_type}, {wl.name}, threads : {wl_threads}, len(cpuset): {wl_cpus}') if wl_threads > wl_cpus: return True @@ -218,9 +221,19 @@ def _is_more_core_benefit(wl: Workload) -> bool: @staticmethod def _is_less_core_benefit(wl: Workload) -> bool: wl_threads = wl.number_of_threads - wl_cpus= len(wl.cgroup_cpuset.read_cpus()) + wl_cpus = len(wl.cgroup_cpuset.read_cpus()) print(f'{wl.wl_type}, {wl.name}, threads : {wl_threads}, len(cpuset): {wl_cpus}') if wl_threads < wl_cpus: return True else: return False + + def store_cur_config(self) -> None: + fg_cgroup_cpuset = self._foreground_wl.cgroup_cpuset + bg_cgroup_cpuset = self._background_wl.cgroup_cpuset + fg_cpuset = fg_cgroup_cpuset.read_cpus() + bg_cpuset = bg_cgroup_cpuset.read_cpus() + self._stored_config = (fg_cpuset, bg_cpuset) + + def load_cur_config(self): + return self._stored_config diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index f886f15..2253a9d 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -40,3 +40,6 @@ def _monitoring_result(self) -> NextStep: def reset(self) -> None: pass + + def store_cur_config(self) -> None: + pass \ No newline at end of file diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index cb9e8bb..e594ea0 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -2,6 +2,7 @@ import logging +from typing import Tuple, Dict from 
.base_isolator import Isolator from .. import NextStep from ...utils import DVFS @@ -17,6 +18,7 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: # FIXME: hard coded self._cur_step = DVFS.MAX + self._stored_config: Tuple[Dict[int, int], ...] = None def strengthen(self) -> 'MemoryIsolator': self._cur_step -= DVFS.STEP @@ -86,3 +88,13 @@ def _monitoring_result(self) -> NextStep: def reset(self) -> None: DVFS.set_freq(DVFS.MAX, self._background_wl.orig_bound_cores) + + def store_cur_config(self) -> None: + fg_rapl_dvfs = self._foreground_wl.dvfs + bg_rapl_dvfs = self._background_wl.dvfs + fg_dvfs = fg_rapl_dvfs.cpufreq + bg_dvfs = bg_rapl_dvfs.cpufreq + self._stored_config = (fg_dvfs, bg_dvfs) + + def load_cur_config(self): + return self._stored_config diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index de2c6c9..65a85ea 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging from abc import ABCMeta, abstractmethod -from typing import Dict, Type +from typing import Dict, Type, Any from .. 
import ResourceType from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator @@ -22,6 +22,8 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR self._aggr_inst_diff: float = None + self._isolator_configs: Dict[Type[Isolator], Any] = dict() + self._profile_stop_cond: int = None # the count to stop solorun profiling condition def __hash__(self) -> int: return id(self) @@ -56,7 +58,8 @@ def contentious_resource(self) -> ResourceType: logger = logging.getLogger(__name__) logger.info(repr(metric_diff)) - logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}, llc_util: {cur_metric.l3_util}') + logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}, ' + f'llc_util: {cur_metric.l3_util}') if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: return ResourceType.CPU @@ -173,3 +176,114 @@ def set_idle_isolator(self) -> None: def reset(self) -> None: for isolator in self._isolator_map.values(): isolator.reset() + + def store_cur_configs(self) -> None: + for isotype, isolator in self._isolator_map.items(): + isolator.store_cur_config() + self._isolator_configs[isotype] = isolator.load_cur_config + + def reset_stored_configs(self) -> None: + """ + Reset stored configs + """ + # Cpuset (Cpuset) + cpuset_config = self._isolator_configs[CoreIsolator] + fg_cpuset, bg_cpuset = cpuset_config + self._fg_wl.cgroup_cpuset.assign_cpus(fg_cpuset) + self._bg_wl.cgroup_cpuset.assign_cpus(bg_cpuset) + + # DVFS (Dict(cpuid, freq)) + dvfs_config = self._isolator_configs[MemoryIsolator] + fg_dvfs_config, bg_dvfs_config = dvfs_config + fg_cpuset = fg_dvfs_config.keys() + fg_cpufreq = fg_dvfs_config.values() + fg_dvfs = self._fg_wl.dvfs + for fg_cpu in fg_cpuset: + freq = fg_cpufreq[fg_cpu] + fg_dvfs.set_freq(freq, fg_cpu) + + bg_cpuset = 
bg_dvfs_config.keys() + bg_cpufreq = bg_dvfs_config.values() + bg_dvfs = self._bg_wl.dvfs + for bg_cpu in bg_cpuset: + freq = bg_cpufreq[bg_cpu] + bg_dvfs.set_freq(freq, bg_cpu) + + # ResCtrl (Mask) + resctrl_config = self._isolator_configs[CacheIsolator] + fg_mask, bg_mask = resctrl_config + self._fg_wl.resctrl.assign_llc(fg_mask) + self._bg_wl.resctrl.assign_llc(bg_mask) + + def profile_solorun(self) -> None: + """ + profile solorun status of a workload + :return: + """ + # suspend all workloads and their perf agents + all_fg_wls = list() + all_bg_wls = list() + fg_wl = self.foreground_workload + bg_wl = self.background_workload + fg_wl.pause() + fg_wl.pause_perf() + bg_wl.pause() + bg_wl.pause_perf() + all_fg_wls.append(fg_wl) + all_bg_wls.append(bg_wl) + + # run FG workloads alone + for fg_wl in all_fg_wls: + fg_wl.profile_solorun = True + fg_wl.resume() + fg_wl.resume_perf() + + def _update_all_workloads_num_threads(self): + """ + update the workloads' number of threads (cur_num_threads -> prev_num_threads) + :return: + """ + bg_wl = self.background_workload + fg_wl = self.foreground_workload + bg_wl.update_num_threads() + fg_wl.update_num_threads() + + def profile_needed(self, profile_interval, schedule_interval, count: int) -> bool: + """ + This function checks if the profiling procedure should be called + + profile_freq : the frequencies of online profiling + :param profile_interval: the frequency of attempting profiling solorun + :param schedule_interval: the frequency of scheduling (isolation) + :param count: This counts the number of entering the run func. 
loop + :return: Decision whether to initiate online solorun profiling + """ + + profile_freq = int(profile_interval/schedule_interval) + fg_wl = self.foreground_workload + if count % profile_freq != 0 and fg_wl.is_num_threads_changed(): + self._update_all_workloads_num_threads() + return False + else: + self._update_all_workloads_num_threads() + return True + + @property + def profile_stop_cond(self) -> int: + return self._profile_stop_cond + + @profile_stop_cond.setter + def profile_stop_cond(self, new_count: int) -> None: + self._profile_stop_cond = new_count + + def all_workload_pause(self): + self._fg_wl.pause() + self._fg_wl.pause_perf() + self._bg_wl.pause() + self._bg_wl.pause_perf() + + def all_workload_resume(self): + self._fg_wl.resume() + self._fg_wl.resume_perf() + self._bg_wl.resume() + self._bg_wl.resume_perf() \ No newline at end of file diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index 086e2c0..b82be82 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -9,6 +9,7 @@ class CpuSet(BaseCgroup): + MOUNT_POINT = '/sys/fs/cgroup/cpuset' CONTROLLER = 'cpuset' def assign_cpus(self, core_set: Iterable[int]) -> None: @@ -33,3 +34,15 @@ def read_mems(self) -> Set[int]: if mems is '': raise ProcessLookupError() return convert_to_set(mems) + + def get_cpu_affinity_from_group(self) -> Set[int]: + with open(f'{CpuSet.MOUNT_POINT}/{self._group_name}/cpuset.cpus', "r") as fp: + line: str = fp.readline() + core_set: Set[int] = convert_to_set(line) + return core_set + + def get_mem_affinity_from_group(self) -> Set[int]: + with open(f'{CpuSet.MOUNT_POINT}/{self._group_name}/cpuset.mems', "r") as fp: + line: str = fp.readline() + mem_set: Set[int] = convert_to_set(line) + return mem_set diff --git a/isolating_controller/utils/dvfs.py b/isolating_controller/utils/dvfs.py index 8a0593c..fc1c9bf 100644 --- a/isolating_controller/utils/dvfs.py +++ 
b/isolating_controller/utils/dvfs.py @@ -2,7 +2,9 @@ import subprocess from pathlib import Path -from typing import Iterable +from typing import Iterable, Dict +from itertools import chain +from isolating_controller.utils.cgroup import CpuSet, Cpu class DVFS: @@ -10,8 +12,44 @@ class DVFS: STEP = 100000 MAX = int(Path('/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq').read_text()) + def __init__(self, group_name): + self._group_name: str = group_name + self._cur_cgroup = CpuSet(self._group_name) + self._cpufreq: Dict[int, int] = dict() + + # FIXME: hard coded to max freq. + self.set_freq_cgroup(DVFS.MAX) + + def set_freq_cgroup(self, target_freq: int): + """ + Set the frequencies to current cgroup cpusets + :param target_freq: freq. to set to cgroup cpuset + :return: + """ + cur_grp_cpuset = self._cur_cgroup.get_cpu_affinity_from_group() + DVFS.set_freq(target_freq, chain(cur_grp_cpuset)) + + @property + def cpufreq(self) -> Dict[int, int]: + """ + Return the cpufreq info + :return: _cpufreq is dict. key:val = cpu_id:cpu_freq + """ + return self._cpufreq + + def save_freq(self, freq: int): + cpuset = self._cpufreq.keys() + for cpu_id in cpuset: + self._cpufreq[cpu_id] = freq + @staticmethod def set_freq(freq: int, cores: Iterable[int]) -> None: + """ + Set the freq. to the specified cores + :param freq: freq. 
to set + :param cores: + :return: + """ for core in cores: subprocess.run(args=('sudo', 'tee', f'/sys/devices/system/cpu/cpu{core}/cpufreq/scaling_max_freq'), check=True, input=f'{freq}\n', encoding='ASCII', stdout=subprocess.DEVNULL) diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index a6817de..966dbae 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -77,3 +77,40 @@ def gen_mask(start: int, end: int = None) -> str: def remove_group(self) -> None: subprocess.check_call(args=('sudo', 'rmdir', str(self._group_path))) + + def get_llc_mask(self) -> List[str]: + """ + :return: `socket_masks` which is the elements of list in hex_str + """ + proc = subprocess.Popen(['cat', f'{ResCtrl.MOUNT_POINT}/{self._group_name}/schemata'], + stdout=subprocess.PIPE) + line = proc.communicate()[0] + striped_schema_line = line.lstrip('L3:').split(';') + socket_masks = list() + for i, item in enumerate(striped_schema_line): + mask = item.lstrip(f'{i}=') + socket_masks.append(mask) + return socket_masks + + @staticmethod + def get_llc_bits_from_mask(input_list: List[str]) -> List[int]: + """ + :param input_list: Assuming the elements of list is hex_str such as "0xfffff" + :return: + """ + output_list = list() + for mask in input_list: + hex_str = mask + hex_int = int(hex_str, 16) + bin_tmp = bin(hex_int) + llc_bits = len(bin_tmp.lstrip('0b')) + output_list.append(llc_bits) + return output_list + + def read_llc_bits(self) -> int: + socket_masks = self.get_llc_mask() + llc_bits_list = ResCtrl.get_llc_bits_from_mask(socket_masks) + ret_llc_bits = 0 + for llc_bits in llc_bits_list: + ret_llc_bits += llc_bits + return ret_llc_bits diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index da03f6f..782e041 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -9,7 +9,7 @@ from .metric_container.basic_metric import BasicMetric, MetricDiff from 
.solorun_data.datas import data_map -from .utils import ResCtrl, numa_topology +from .utils import ResCtrl, DVFS, numa_topology from .utils.cgroup import Cpu, CpuSet @@ -35,6 +35,7 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._cgroup_cpuset = CpuSet(self.group_name) self._cgroup_cpu = Cpu(self.group_name) self._resctrl = ResCtrl(self.group_name) + self._dvfs = DVFS(self.group_name) self._profile_solorun: bool = False self._solorun_data_queue: Deque[BasicMetric] = deque() # This queue is used to collect and calculate avg. status @@ -62,6 +63,10 @@ def cgroup_cpu(self) -> Cpu: def resctrl(self) -> ResCtrl: return self._resctrl + @property + def dvfs(self) -> DVFS: + return self._dvfs + @property def name(self) -> str: return self._name From 7c18e96cd2b2112cf79abde89da3de9d3ed30159 Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Mon, 8 Oct 2018 13:15:24 +0900 Subject: [PATCH 33/82] fix: Fix some minor bugs --- controller.py | 9 ++++++++- isolating_controller/isolation/isolators/idle.py | 3 +++ .../isolation/policies/base_policy.py | 14 ++++++++------ isolating_controller/utils/resctrl.py | 2 +- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/controller.py b/controller.py index 919008e..60474f0 100755 --- a/controller.py +++ b/controller.py @@ -160,13 +160,19 @@ def _isolate_workloads(self) -> None: for group, iteration_num in self._isolation_groups.items(): if group.profile_needed(self._profile_interval, self._interval, self._count): + logger.info(f'store_cur_configs') group.store_cur_configs() - group.profile_solorun() group.profile_stop_cond = self._count + int(self._solorun_interval/self._interval) + logger.info(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') + group.profile_solorun() elif group.foreground_workload.profile_solorun is True and self._count > group.profile_stop_cond: + logger.info(f'all_workload_pause') group.all_workload_pause() + logger.info(f'fg.profile_solorun = False') 
group.foreground_workload.profile_solorun = False + logger.info(f'reset_stored_configs') group.reset_stored_configs() + logger.info(f'all_workload_resume') group.all_workload_resume() logger.info('') @@ -200,6 +206,7 @@ def _isolate_workloads(self) -> None: finally: self._isolation_groups[group] += 1 + self._count += 1 def _register_pending_workloads(self) -> None: """ diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index 2253a9d..7d3cfb7 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -42,4 +42,7 @@ def reset(self) -> None: pass def store_cur_config(self) -> None: + pass + + def load_cur_config(self): pass \ No newline at end of file diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 65a85ea..c764ca3 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -180,7 +180,7 @@ def reset(self) -> None: def store_cur_configs(self) -> None: for isotype, isolator in self._isolator_map.items(): isolator.store_cur_config() - self._isolator_configs[isotype] = isolator.load_cur_config + self._isolator_configs[isotype] = isolator.load_cur_config() def reset_stored_configs(self) -> None: """ @@ -188,13 +188,13 @@ def reset_stored_configs(self) -> None: """ # Cpuset (Cpuset) cpuset_config = self._isolator_configs[CoreIsolator] - fg_cpuset, bg_cpuset = cpuset_config + (fg_cpuset, bg_cpuset) = cpuset_config self._fg_wl.cgroup_cpuset.assign_cpus(fg_cpuset) self._bg_wl.cgroup_cpuset.assign_cpus(bg_cpuset) # DVFS (Dict(cpuid, freq)) dvfs_config = self._isolator_configs[MemoryIsolator] - fg_dvfs_config, bg_dvfs_config = dvfs_config + (fg_dvfs_config, bg_dvfs_config) = dvfs_config fg_cpuset = fg_dvfs_config.keys() fg_cpufreq = fg_dvfs_config.values() fg_dvfs = self._fg_wl.dvfs @@ -211,7 +211,7 @@ 
def reset_stored_configs(self) -> None: # ResCtrl (Mask) resctrl_config = self._isolator_configs[CacheIsolator] - fg_mask, bg_mask = resctrl_config + (fg_mask, bg_mask) = resctrl_config self._fg_wl.resctrl.assign_llc(fg_mask) self._bg_wl.resctrl.assign_llc(bg_mask) @@ -258,10 +258,12 @@ def profile_needed(self, profile_interval, schedule_interval, count: int) -> boo :param count: This counts the number of entering the run func. loop :return: Decision whether to initiate online solorun profiling """ - + logger = logging.getLogger(__name__) profile_freq = int(profile_interval/schedule_interval) fg_wl = self.foreground_workload - if count % profile_freq != 0 and fg_wl.is_num_threads_changed(): + logger.info(f'count: {count}, profile_freq: {profile_freq}, ' + f'fg_wl.is_num_threads_changed(): {fg_wl.is_num_threads_changed()}') + if count % profile_freq != 0 or not fg_wl.is_num_threads_changed(): self._update_all_workloads_num_threads() return False else: diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index 966dbae..7c019f4 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -84,7 +84,7 @@ def get_llc_mask(self) -> List[str]: """ proc = subprocess.Popen(['cat', f'{ResCtrl.MOUNT_POINT}/{self._group_name}/schemata'], stdout=subprocess.PIPE) - line = proc.communicate()[0] + line = proc.communicate()[0].decode() striped_schema_line = line.lstrip('L3:').split(';') socket_masks = list() for i, item in enumerate(striped_schema_line): From 9c8faa2aabc84af379a53d6e77d9612c1d95793b Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 9 Oct 2018 15:39:32 +0900 Subject: [PATCH 34/82] fix: Fix code to calculate avg profiled solorun data --- controller.py | 17 +++++++++-- .../isolation/isolators/cache.py | 2 +- .../isolation/policies/base_policy.py | 29 ++++++++++++++++--- .../metric_container/basic_metric.py | 18 +++++++----- isolating_controller/utils/resctrl.py | 8 +++-- 
isolating_controller/workload.py | 24 +++++++++++++-- 6 files changed, 80 insertions(+), 18 deletions(-) diff --git a/controller.py b/controller.py index 60474f0..a5f79e8 100755 --- a/controller.py +++ b/controller.py @@ -110,7 +110,7 @@ def _cbk_wl_monitor(self, workload: Workload, elif workload.profile_solorun: logger.debug(f'Metric_queue : workload.profile_solorun') # init the solorun_data_queue - workload.solorun_data_queue.clear() + #workload.solorun_data_queue.clear() # suspend ALL BGs in the same socket metric_que = workload.solorun_data_queue @@ -163,15 +163,24 @@ def _isolate_workloads(self) -> None: logger.info(f'store_cur_configs') group.store_cur_configs() group.profile_stop_cond = self._count + int(self._solorun_interval/self._interval) + logger.info(f'reset_to_initial_configs') + group.reset() logger.info(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') group.profile_solorun() + elif group.foreground_workload.profile_solorun is True and self._count > group.profile_stop_cond: logger.info(f'all_workload_pause') group.all_workload_pause() + logger.info(f'fg.profile_solorun = False') group.foreground_workload.profile_solorun = False + logger.info(f'calc_and_update fg._avg_solorun_data') + #logger.info(f'fg_wl.solorun_data_queue: {group._fg_wl.solorun_data_queue}') + group.foreground_workload.calc_avg_solorun() + logger.info(f'fg_wl.avg_solorun_data: {group._fg_wl.avg_solorun_data}') logger.info(f'reset_stored_configs') group.reset_stored_configs() + logger.info(f'all_workload_resume') group.all_workload_resume() @@ -185,6 +194,10 @@ def _isolate_workloads(self) -> None: cur_isolator: Isolator = group.cur_isolator decided_next_step: NextStep = cur_isolator.decide_next_step() + + if group.fg_runs_alone is True: + decided_next_step = NextStep.IDLE + logger.info(f'Monitoring Result : {decided_next_step.name}') if decided_next_step is NextStep.STRENGTHEN: @@ -342,7 +355,7 @@ def main() -> None: module_logger.addHandler(stream_handler) 
monitoring_logger = logging.getLogger('monitoring') - monitoring_logger.setLevel(logging.INFO) + monitoring_logger.setLevel(logging.DEBUG) monitoring_logger.addHandler(stream_handler) controller = MainController(args.buf_size) diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 33f8b80..d5d6ce6 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -64,7 +64,7 @@ def _enforce(self) -> None: # FIXME: hard coded -> The number of socket is two at most masks = [ResCtrl.MIN_MASK, ResCtrl.MIN_MASK] - masks[self._foreground_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) + masks[self._foreground_wl.cur_socket_id()] = ResCtrl.gen_mask(0, self._cur_step) self._foreground_wl.resctrl.assign_llc(*masks) # FIXME: hard coded -> The number of socket is two at most diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index c764ca3..8c2ef33 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -24,6 +24,8 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._aggr_inst_diff: float = None self._isolator_configs: Dict[Type[Isolator], Any] = dict() self._profile_stop_cond: int = None # the count to stop solorun profiling condition + self._thread_changed: bool = False + self._fg_runs_alone: bool = False def __hash__(self) -> int: return id(self) @@ -186,6 +188,7 @@ def reset_stored_configs(self) -> None: """ Reset stored configs """ + logger = logging.getLogger(__name__) # Cpuset (Cpuset) cpuset_config = self._isolator_configs[CoreIsolator] (fg_cpuset, bg_cpuset) = cpuset_config @@ -212,8 +215,10 @@ def reset_stored_configs(self) -> None: # ResCtrl (Mask) resctrl_config = self._isolator_configs[CacheIsolator] (fg_mask, bg_mask) = resctrl_config - 
self._fg_wl.resctrl.assign_llc(fg_mask) - self._bg_wl.resctrl.assign_llc(bg_mask) + logger.info(f'fg_mask: {fg_mask}, bg_mask: {bg_mask}') + logger.info(f'fg_path: {self._fg_wl.resctrl.MOUNT_POINT/self._fg_wl.group_name}') + self._fg_wl.resctrl.assign_llc(*fg_mask) + self._bg_wl.resctrl.assign_llc(*bg_mask) def profile_solorun(self) -> None: """ @@ -234,7 +239,9 @@ def profile_solorun(self) -> None: # run FG workloads alone for fg_wl in all_fg_wls: + fg_wl.solorun_data_queue.clear() # clear the prev. solorun data fg_wl.profile_solorun = True + self.fg_runs_alone = True fg_wl.resume() fg_wl.resume_perf() @@ -263,11 +270,16 @@ def profile_needed(self, profile_interval, schedule_interval, count: int) -> boo fg_wl = self.foreground_workload logger.info(f'count: {count}, profile_freq: {profile_freq}, ' f'fg_wl.is_num_threads_changed(): {fg_wl.is_num_threads_changed()}') - if count % profile_freq != 0 or not fg_wl.is_num_threads_changed(): + + if fg_wl.is_num_threads_changed(): + fg_wl.thread_changed_before = True + + if count % profile_freq != 0 or not fg_wl.thread_changed_before: self._update_all_workloads_num_threads() return False else: self._update_all_workloads_num_threads() + fg_wl.thread_changed_before = False return True @property @@ -288,4 +300,13 @@ def all_workload_resume(self): self._fg_wl.resume() self._fg_wl.resume_perf() self._bg_wl.resume() - self._bg_wl.resume_perf() \ No newline at end of file + self._bg_wl.resume_perf() + self.fg_runs_alone = False + + @property + def fg_runs_alone(self) -> bool: + return self._fg_runs_alone + + @fg_runs_alone.setter + def fg_runs_alone(self, new_val) -> None: + self._fg_runs_alone = new_val diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 8e2d28f..79b4b8a 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -5,7 +5,7 @@ from time import localtime, strftime from 
cpuinfo import cpuinfo - +from typing import Type LLC_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 @@ -127,33 +127,37 @@ def l3_util(self) -> float: def __repr__(self) -> str: return ', '.join(map(str, ( - self._l2miss, self._l3miss, self._instructions, self._cycles, self._stall_cycles, - self._intra_coh, self._inter_coh, self._llc_size, self._req_date))) + self._l2miss, self._l3miss, self._instructions, self._cycles, self._stall_cycles, self._wall_cycles, + self._intra_coh, self._inter_coh, self._llc_size, self._local_mem, self._remote_mem, + self._interval, self._req_date))) - def __add__(self, others): + def __iadd__(self, others): self._l2miss = self.l2miss + others.l2miss self._l3miss = self.l3miss + others.l3miss self._instructions = self.instruction + others.instruction - self._wall_cycles = self.wall_cycles + others.wall_cycles + self._cycles = self._cycles + others.cycles self._stall_cycles = self.stall_cycle + others.stall_cycle + self._wall_cycles = self.wall_cycles + others.wall_cycles self._intra_coh = self.intra_coh + others.intra_coh self._inter_coh = self.inter_coh + others.inter_coh self._llc_size = self.llc_size + others.llc_size self._local_mem = self.local_mem + others.local_mem self._remote_mem = self.remote_mem + others.remote_mem + return self - def __truediv__(self, other): + def __truediv__(self, other: int): self._l2miss /= other self._l3miss /= other self._instructions /= other - self._wall_cycles /= other self._cycles /= other self._stall_cycles /= other + self._wall_cycles /= other self._intra_coh /= other self._inter_coh /= other self._llc_size /= other self._local_mem /= other self._remote_mem /= other + return self class MetricDiff: diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index 7c019f4..c2ff405 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -1,6 +1,7 @@ # coding: UTF-8 import re +import logging import 
subprocess from pathlib import Path from typing import List, Tuple @@ -46,8 +47,11 @@ def add_task(self, pid: int) -> None: input=f'{pid}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) def assign_llc(self, *masks: str) -> None: + logger = logging.getLogger(__name__) masks = (f'{i}={mask}' for i, mask in enumerate(masks)) mask = ';'.join(masks) + #subprocess.check_call('ls -ll /sys/fs/resctrl/', shell=True) + logger.info(f'mask: {mask}') subprocess.run(args=('sudo', 'tee', str(self._group_path / 'schemata')), input=f'L3:{mask}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) @@ -84,8 +88,8 @@ def get_llc_mask(self) -> List[str]: """ proc = subprocess.Popen(['cat', f'{ResCtrl.MOUNT_POINT}/{self._group_name}/schemata'], stdout=subprocess.PIPE) - line = proc.communicate()[0].decode() - striped_schema_line = line.lstrip('L3:').split(';') + line = proc.communicate()[0].decode().lstrip() + striped_schema_line = line.lstrip('L3:').rstrip('\n').split(';') socket_masks = list() for i, item in enumerate(striped_schema_line): mask = item.lstrip(f'{i}=') diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 782e041..f36fabf 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -41,6 +41,7 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._solorun_data_queue: Deque[BasicMetric] = deque() # This queue is used to collect and calculate avg. status self._avg_solorun_data: BasicMetric = None # This variable is used to contain the recent avg. status self._prev_num_threads: int = None + self._thread_changed_before: bool = False self._orig_bound_cores: Tuple[int, ...] 
= tuple(self._cgroup_cpuset.read_cpus()) self._orig_bound_mems: Set[int] = self._cgroup_cpuset.read_mems() @@ -139,6 +140,14 @@ def number_of_threads(self) -> int: def prev_num_threads(self) -> int: return self._prev_num_threads + @property + def thread_changed_before(self) -> bool: + return self._thread_changed_before + + @thread_changed_before.setter + def thread_changed_before(self, new_val) -> None: + self._thread_changed_before = new_val + def update_num_threads(self) -> None: self._prev_num_threads = self._proc_info.num_threads() @@ -159,21 +168,30 @@ def avg_solorun_data(self) -> BasicMetric: return self._avg_solorun_data def calc_avg_solorun(self) -> None: + logger = logging.getLogger(__name__) counts = 0 sum_of_items = BasicMetric() for item in self.solorun_data_queue: + logger.info(f'item in solorun_data_queue : {item}') sum_of_items += item + logger.info(f'sum_of_items[{counts}] : {sum_of_items}') counts += 1 - self._avg_solorun_data = sum_of_items / counts + logger.info(f'self.solorun_data_queue : {self.solorun_data_queue}') + logger.info(f'after sum, sum_of_items : {sum_of_items}') + self._avg_solorun_data = sum_of_items/counts + logger.info(f'after truediv, truediv_of_items : {self._avg_solorun_data}') def calc_metric_diff(self) -> MetricDiff: - logger=logging.getLogger(__name__) + logger = logging.getLogger(__name__) #solorun_data = data_map[self.name] if self._avg_solorun_data is not None: solorun_data = self._avg_solorun_data else: solorun_data = data_map[self.name] curr_metric: BasicMetric = self._metrics[0] + logger.info(f'solorun_data L3 hit ratio: {solorun_data.l3hit_ratio}, ' + f'Local Mem BW ps : {solorun_data.local_mem_ps()}, ' + f'Instruction ps. 
: {solorun_data.instruction_ps}') return MetricDiff(curr_metric, solorun_data) def all_child_tid(self) -> Tuple[int, ...]: @@ -216,3 +234,5 @@ def is_num_threads_changed(self) -> bool: return False else: return True + + From 8b2eff5cde19923433907c43baf03a4e4191c6fe Mon Sep 17 00:00:00 2001 From: Yoonsung Nam Date: Tue, 9 Oct 2018 17:57:28 +0900 Subject: [PATCH 35/82] fix: Fix exceptions and modify interval default value --- controller.py | 56 +++++++++---------- .../metric_container/basic_metric.py | 2 +- isolating_controller/workload.py | 12 +++- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/controller.py b/controller.py index a5f79e8..880c05a 100755 --- a/controller.py +++ b/controller.py @@ -159,35 +159,35 @@ def _isolate_workloads(self) -> None: self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): - if group.profile_needed(self._profile_interval, self._interval, self._count): - logger.info(f'store_cur_configs') - group.store_cur_configs() - group.profile_stop_cond = self._count + int(self._solorun_interval/self._interval) - logger.info(f'reset_to_initial_configs') - group.reset() - logger.info(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') - group.profile_solorun() - - elif group.foreground_workload.profile_solorun is True and self._count > group.profile_stop_cond: - logger.info(f'all_workload_pause') - group.all_workload_pause() - - logger.info(f'fg.profile_solorun = False') - group.foreground_workload.profile_solorun = False - logger.info(f'calc_and_update fg._avg_solorun_data') - #logger.info(f'fg_wl.solorun_data_queue: {group._fg_wl.solorun_data_queue}') - group.foreground_workload.calc_avg_solorun() - logger.info(f'fg_wl.avg_solorun_data: {group._fg_wl.avg_solorun_data}') - logger.info(f'reset_stored_configs') - group.reset_stored_configs() - - logger.info(f'all_workload_resume') - group.all_workload_resume() - - logger.info('') - logger.info(f'***************isolation of {group.name} 
#{iteration_num}***************') - try: + if group.profile_needed(self._profile_interval, self._interval, self._count): + logger.info(f'store_cur_configs') + group.store_cur_configs() + group.profile_stop_cond = self._count + int(self._solorun_interval/self._interval) + logger.info(f'reset_to_initial_configs') + group.reset() + logger.info(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') + group.profile_solorun() + + elif group.foreground_workload.profile_solorun is True and self._count > group.profile_stop_cond: + logger.info(f'all_workload_pause') + group.all_workload_pause() + + logger.info(f'fg.profile_solorun = False') + group.foreground_workload.profile_solorun = False + logger.info(f'calc_and_update fg._avg_solorun_data') + #logger.info(f'fg_wl.solorun_data_queue: {group._fg_wl.solorun_data_queue}') + group.foreground_workload.calc_avg_solorun() + logger.info(f'fg_wl.avg_solorun_data: {group._fg_wl.avg_solorun_data}') + logger.info(f'reset_stored_configs') + group.reset_stored_configs() + + logger.info(f'all_workload_resume') + group.all_workload_resume() + + logger.info('') + logger.info(f'***************isolation of {group.name} #{iteration_num}***************') + if group.new_isolator_needed: group.choose_next_isolator() diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 79b4b8a..279a4a9 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -12,7 +12,7 @@ class BasicMetric: def __init__(self, l2miss=0, l3miss=0, inst=0, cycles=0, stall_cycles=0, wall_cycles=0, intra_coh=0, - inter_coh=0, llc_size=0, local_mem=0, remote_mem=0, interval: int=1000): + inter_coh=0, llc_size=0, local_mem=0, remote_mem=0, interval: int=50): self._l2miss = l2miss self._l3miss = l3miss self._instructions = inst diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index f36fabf..53ff7fd 
100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -134,7 +134,10 @@ def group_name(self) -> str: @property def number_of_threads(self) -> int: - return self._proc_info.num_threads() + try: + return self._proc_info.num_threads() + except psutil.NoSuchProcess: + return 0 @property def prev_num_threads(self) -> int: @@ -149,7 +152,10 @@ def thread_changed_before(self, new_val) -> None: self._thread_changed_before = new_val def update_num_threads(self) -> None: - self._prev_num_threads = self._proc_info.num_threads() + try: + self._prev_num_threads = self._proc_info.num_threads() + except psutil.NoSuchProcess: + self._prev_num_threads = 0 @property def profile_solorun(self) -> bool: @@ -170,7 +176,7 @@ def avg_solorun_data(self) -> BasicMetric: def calc_avg_solorun(self) -> None: logger = logging.getLogger(__name__) counts = 0 - sum_of_items = BasicMetric() + sum_of_items = BasicMetric(interval=50) for item in self.solorun_data_queue: logger.info(f'item in solorun_data_queue : {item}') sum_of_items += item From 51c5e18bce70eb04d4183d9195c247485759afb0 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 12:41:20 +0900 Subject: [PATCH 36/82] reformat codes --- controller.py | 10 +++++----- isolating_controller/isolation/__init__.py | 2 +- .../isolation/isolators/__init__.py | 2 +- isolating_controller/isolation/isolators/core.py | 2 +- isolating_controller/isolation/isolators/idle.py | 4 ++-- .../isolation/isolators/memory.py | 2 +- .../isolation/policies/base_policy.py | 4 ++-- .../isolation/policies/diff_policy_cpu.py | 2 +- .../isolation/policies/greedy_diff_policy.py | 10 +++++----- .../policies/greedy_diff_with_violation_policy.py | 2 +- .../metric_container/basic_metric.py | 3 +-- isolating_controller/utils/dvfs.py | 7 ++++--- isolating_controller/utils/resctrl.py | 4 ++-- isolating_controller/workload.py | 15 +++++++-------- proc_arbitrator.py | 4 ++-- 15 files changed, 36 insertions(+), 37 deletions(-) diff 
--git a/controller.py b/controller.py index 880c05a..6b62bbb 100755 --- a/controller.py +++ b/controller.py @@ -110,7 +110,7 @@ def _cbk_wl_monitor(self, workload: Workload, elif workload.profile_solorun: logger.debug(f'Metric_queue : workload.profile_solorun') # init the solorun_data_queue - #workload.solorun_data_queue.clear() + # workload.solorun_data_queue.clear() # suspend ALL BGs in the same socket metric_que = workload.solorun_data_queue @@ -146,7 +146,7 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._pending_queue: PendingQueue = pending_queue self._interval: float = 0.2 # scheduling interval (sec) - self._count: int = 0 # scheduling counts + self._count: int = 0 # scheduling counts self._profile_interval: float = 1.0 # check interval for phase change (sec) self._solorun_interval: float = 2.0 # the FG's solorun profiling interval (sec) self._isolation_groups: Dict[IsolationPolicy, int] = dict() @@ -163,7 +163,7 @@ def _isolate_workloads(self) -> None: if group.profile_needed(self._profile_interval, self._interval, self._count): logger.info(f'store_cur_configs') group.store_cur_configs() - group.profile_stop_cond = self._count + int(self._solorun_interval/self._interval) + group.profile_stop_cond = self._count + int(self._solorun_interval / self._interval) logger.info(f'reset_to_initial_configs') group.reset() logger.info(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') @@ -176,7 +176,7 @@ def _isolate_workloads(self) -> None: logger.info(f'fg.profile_solorun = False') group.foreground_workload.profile_solorun = False logger.info(f'calc_and_update fg._avg_solorun_data') - #logger.info(f'fg_wl.solorun_data_queue: {group._fg_wl.solorun_data_queue}') + # logger.info(f'fg_wl.solorun_data_queue: {group._fg_wl.solorun_data_queue}') group.foreground_workload.calc_avg_solorun() logger.info(f'fg_wl.avg_solorun_data: {group._fg_wl.avg_solorun_data}') logger.info(f'reset_stored_configs') @@ -310,7 +310,7 @@ def _profile_needed(self, 
count: int) -> bool: :return: Decision whether to initiate online solorun profiling """ - profile_freq = int(self._profile_interval/self._interval) + profile_freq = int(self._profile_interval / self._interval) for group in self._isolation_groups: fg_wl = group.foreground_workload if count % profile_freq != 0 and fg_wl.is_num_threads_changed(): diff --git a/isolating_controller/isolation/__init__.py b/isolating_controller/isolation/__init__.py index 999192f..9075717 100644 --- a/isolating_controller/isolation/__init__.py +++ b/isolating_controller/isolation/__init__.py @@ -14,4 +14,4 @@ class ResourceType(IntEnum): CPU = 0 CACHE = 1 MEMORY = 2 - Unknown = 3 \ No newline at end of file + Unknown = 3 diff --git a/isolating_controller/isolation/isolators/__init__.py b/isolating_controller/isolation/isolators/__init__.py index 3eecaf2..6bd83a0 100644 --- a/isolating_controller/isolation/isolators/__init__.py +++ b/isolating_controller/isolation/isolators/__init__.py @@ -3,7 +3,7 @@ from .base_isolator import Isolator from .cache import CacheIsolator +from .core import CoreIsolator from .idle import IdleIsolator from .memory import MemoryIsolator -from .core import CoreIsolator from .schedule import SchedIsolator diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 0214bf8..cb8fa21 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,8 +1,8 @@ # coding: UTF-8 import logging +from typing import Set, Tuple -from typing import Tuple, Set from .base_isolator import Isolator from .. 
import NextStep, ResourceType from ...workload import Workload diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index 7d3cfb7..6354e08 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -43,6 +43,6 @@ def reset(self) -> None: def store_cur_config(self) -> None: pass - + def load_cur_config(self): - pass \ No newline at end of file + pass diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index e594ea0..0816d8b 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -1,8 +1,8 @@ # coding: UTF-8 import logging +from typing import Dict, Tuple -from typing import Tuple, Dict from .base_isolator import Isolator from .. import NextStep from ...utils import DVFS diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 8c2ef33..02f3431 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging from abc import ABCMeta, abstractmethod -from typing import Dict, Type, Any +from typing import Any, Dict, Type from .. 
import ResourceType from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator @@ -266,7 +266,7 @@ def profile_needed(self, profile_interval, schedule_interval, count: int) -> boo :return: Decision whether to initiate online solorun profiling """ logger = logging.getLogger(__name__) - profile_freq = int(profile_interval/schedule_interval) + profile_freq = int(profile_interval / schedule_interval) fg_wl = self.foreground_workload logger.info(f'count: {count}, profile_freq: {profile_freq}, ' f'fg_wl.is_num_threads_changed(): {fg_wl.is_num_threads_changed()}') diff --git a/isolating_controller/isolation/policies/diff_policy_cpu.py b/isolating_controller/isolation/policies/diff_policy_cpu.py index e61893e..209a00e 100644 --- a/isolating_controller/isolation/policies/diff_policy_cpu.py +++ b/isolating_controller/isolation/policies/diff_policy_cpu.py @@ -2,8 +2,8 @@ import logging -from .. import ResourceType from .base_policy import IsolationPolicy +from .. import ResourceType from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 096b822..8a89c0f 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -4,7 +4,7 @@ from .base_policy import IsolationPolicy from .. 
import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload @@ -27,20 +27,20 @@ def choose_next_isolator(self) -> bool: if resource is ResourceType.CPU: self._cur_isolator = self._isolator_map[CoreIsolator] self._cur_isolator._contentious_resource = ResourceType.CPU - #logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') + # logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') logger.info(f'Resource Type: {ResourceType.CPU.name}, CoreIsolation') return True elif resource is ResourceType.CACHE: self._cur_isolator = self._isolator_map[CacheIsolator] - #logger.info(f'Cache Isolation for {self._fg_wl} is started') + # logger.info(f'Cache Isolation for {self._fg_wl} is started') logger.info(f'Resource Type: {ResourceType.CACHE.name}, CacheIsolation') return True elif not self._is_mem_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[MemoryIsolator] self._is_mem_isolated = True - #logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') + # logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') logger.info(f'Resource Type: {ResourceType.MEMORY.name}, MemoryIsolation') return True @@ -48,7 +48,7 @@ def choose_next_isolator(self) -> bool: self._cur_isolator = self._isolator_map[CoreIsolator] self._cur_isolator._contentious_resource = ResourceType.MEMORY self._is_mem_isolated = False - #logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') + # logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') logger.info(f'Resource Type: {ResourceType.MEMORY.name}, CoreIsolation') return True diff --git 
a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index fd240b5..0a1b623 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .greedy_diff_policy import GreedyDiffPolicy from .. import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 279a4a9..ccd7c16 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -5,14 +5,13 @@ from time import localtime, strftime from cpuinfo import cpuinfo -from typing import Type LLC_SIZE = int(cpuinfo.get_cpu_info()['l3_cache_size'].split()[0]) * 1024 class BasicMetric: def __init__(self, l2miss=0, l3miss=0, inst=0, cycles=0, stall_cycles=0, wall_cycles=0, intra_coh=0, - inter_coh=0, llc_size=0, local_mem=0, remote_mem=0, interval: int=50): + inter_coh=0, llc_size=0, local_mem=0, remote_mem=0, interval: int = 50): self._l2miss = l2miss self._l3miss = l3miss self._instructions = inst diff --git a/isolating_controller/utils/dvfs.py b/isolating_controller/utils/dvfs.py index fc1c9bf..77b7a85 100644 --- a/isolating_controller/utils/dvfs.py +++ b/isolating_controller/utils/dvfs.py @@ -1,10 +1,11 @@ # coding: UTF-8 import subprocess -from pathlib import Path -from typing import Iterable, Dict from itertools import chain -from isolating_controller.utils.cgroup import CpuSet, Cpu +from pathlib import Path +from typing import Dict, Iterable + +from isolating_controller.utils.cgroup import CpuSet class DVFS: diff --git 
a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index c2ff405..b906b96 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -1,7 +1,7 @@ # coding: UTF-8 -import re import logging +import re import subprocess from pathlib import Path from typing import List, Tuple @@ -50,7 +50,7 @@ def assign_llc(self, *masks: str) -> None: logger = logging.getLogger(__name__) masks = (f'{i}={mask}' for i, mask in enumerate(masks)) mask = ';'.join(masks) - #subprocess.check_call('ls -ll /sys/fs/resctrl/', shell=True) + # subprocess.check_call('ls -ll /sys/fs/resctrl/', shell=True) logger.info(f'mask: {mask}') subprocess.run(args=('sudo', 'tee', str(self._group_path / 'schemata')), input=f'L3:{mask}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 53ff7fd..adbbf2f 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -1,15 +1,15 @@ # coding: UTF-8 +import logging from collections import deque from itertools import chain from typing import Deque, Iterable, Set, Tuple import psutil -import logging from .metric_container.basic_metric import BasicMetric, MetricDiff from .solorun_data.datas import data_map -from .utils import ResCtrl, DVFS, numa_topology +from .utils import DVFS, ResCtrl, numa_topology from .utils.cgroup import Cpu, CpuSet @@ -38,8 +38,9 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._dvfs = DVFS(self.group_name) self._profile_solorun: bool = False - self._solorun_data_queue: Deque[BasicMetric] = deque() # This queue is used to collect and calculate avg. status - self._avg_solorun_data: BasicMetric = None # This variable is used to contain the recent avg. status + self._solorun_data_queue: Deque[ + BasicMetric] = deque() # This queue is used to collect and calculate avg. 
status + self._avg_solorun_data: BasicMetric = None # This variable is used to contain the recent avg. status self._prev_num_threads: int = None self._thread_changed_before: bool = False @@ -184,12 +185,12 @@ def calc_avg_solorun(self) -> None: counts += 1 logger.info(f'self.solorun_data_queue : {self.solorun_data_queue}') logger.info(f'after sum, sum_of_items : {sum_of_items}') - self._avg_solorun_data = sum_of_items/counts + self._avg_solorun_data = sum_of_items / counts logger.info(f'after truediv, truediv_of_items : {self._avg_solorun_data}') def calc_metric_diff(self) -> MetricDiff: logger = logging.getLogger(__name__) - #solorun_data = data_map[self.name] + # solorun_data = data_map[self.name] if self._avg_solorun_data is not None: solorun_data = self._avg_solorun_data else: @@ -240,5 +241,3 @@ def is_num_threads_changed(self) -> bool: return False else: return True - - diff --git a/proc_arbitrator.py b/proc_arbitrator.py index cc1a4b5..db032ad 100644 --- a/proc_arbitrator.py +++ b/proc_arbitrator.py @@ -1,13 +1,13 @@ #!/usr/bin/env python # coding=UTF-8 -from __future__ import print_function, division +from __future__ import division, print_function import multiprocessing import os import sys import time -from signal import SIGSTOP, SIGCONT +from signal import SIGCONT, SIGSTOP from threading import Timer import psutil From 7285151ed79e4bf902fbdbae78dbd5c4c36dd26f Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 14:25:06 +0900 Subject: [PATCH 37/82] fixes logging levels --- controller.py | 37 +++++++++++-------- .../isolation/policies/base_policy.py | 14 ++++--- isolating_controller/utils/resctrl.py | 3 -- isolating_controller/workload.py | 16 ++++---- pending_queue.py | 2 +- 5 files changed, 38 insertions(+), 34 deletions(-) diff --git a/controller.py b/controller.py index 6b62bbb..75efff4 100755 --- a/controller.py +++ b/controller.py @@ -2,9 +2,11 @@ # coding: UTF-8 import argparse +import datetime import functools import json import 
logging +import os import subprocess import sys import time @@ -104,11 +106,11 @@ def _cbk_wl_monitor(self, workload: Workload, metric_que = None if not workload.profile_solorun: - logger.debug(f'Metric_queue : workload.metrics') + logger.debug('Metric_queue : workload.metrics') # TODO: Do we need clear()? metric_que = workload.metrics elif workload.profile_solorun: - logger.debug(f'Metric_queue : workload.profile_solorun') + logger.debug('Metric_queue : workload.profile_solorun') # init the solorun_data_queue # workload.solorun_data_queue.clear() # suspend ALL BGs in the same socket @@ -120,7 +122,7 @@ def _cbk_wl_monitor(self, workload: Workload, metric_que.appendleft(item) def run(self) -> None: - logger = logging.getLogger(__name__) + logger = logging.getLogger('monitoring') self._control_thread.start() @@ -159,34 +161,37 @@ def _isolate_workloads(self) -> None: self._swapper.try_swap() for group, iteration_num in self._isolation_groups.items(): + logger.info('') + logger.info(f'***************isolation of {group.name} #{iteration_num}***************') + try: if group.profile_needed(self._profile_interval, self._interval, self._count): - logger.info(f'store_cur_configs') + logger.debug('store_cur_configs') group.store_cur_configs() group.profile_stop_cond = self._count + int(self._solorun_interval / self._interval) - logger.info(f'reset_to_initial_configs') + logger.debug('reset_to_initial_configs') group.reset() - logger.info(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') + logger.debug(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') group.profile_solorun() - elif group.foreground_workload.profile_solorun is True and self._count > group.profile_stop_cond: - logger.info(f'all_workload_pause') + logger.info('skipping isolation because of solorun profiling...') + + elif group.foreground_workload.profile_solorun and self._count > group.profile_stop_cond: + logger.debug('all_workload_pause') group.all_workload_pause() - 
logger.info(f'fg.profile_solorun = False') + logger.debug('fg.profile_solorun = False') group.foreground_workload.profile_solorun = False - logger.info(f'calc_and_update fg._avg_solorun_data') - # logger.info(f'fg_wl.solorun_data_queue: {group._fg_wl.solorun_data_queue}') + logger.debug('calc_and_update fg._avg_solorun_data') group.foreground_workload.calc_avg_solorun() - logger.info(f'fg_wl.avg_solorun_data: {group._fg_wl.avg_solorun_data}') - logger.info(f'reset_stored_configs') + logger.debug(f'fg_wl.avg_solorun_data: {group.foreground_workload.avg_solorun_data}') + logger.debug('reset_stored_configs') group.reset_stored_configs() - logger.info(f'all_workload_resume') + logger.debug('all_workload_resume') group.all_workload_resume() - logger.info('') - logger.info(f'***************isolation of {group.name} #{iteration_num}***************') + logger.info('skipping isolation... because corun data isn\'t collected yet') if group.new_isolator_needed: group.choose_next_isolator() diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 02f3431..4bc02bd 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -1,4 +1,5 @@ # coding: UTF-8 + import logging from abc import ABCMeta, abstractmethod from typing import Any, Dict, Type @@ -60,8 +61,9 @@ def contentious_resource(self) -> ResourceType: logger = logging.getLogger(__name__) logger.info(repr(metric_diff)) - logger.info(f'l3_int: {cur_metric.l3_intensity}, mem_int: {cur_metric.mem_intensity}, ' - f'llc_util: {cur_metric.l3_util}') + logger.info(f'l3_int: {cur_metric.l3_intensity:>7.04f}, ' + f'mem_int: {cur_metric.mem_intensity:>7.04f}, ' + f'llc_util: {cur_metric.l3_util:>7.04f}') if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: return ResourceType.CPU @@ -215,8 +217,8 @@ def 
reset_stored_configs(self) -> None: # ResCtrl (Mask) resctrl_config = self._isolator_configs[CacheIsolator] (fg_mask, bg_mask) = resctrl_config - logger.info(f'fg_mask: {fg_mask}, bg_mask: {bg_mask}') - logger.info(f'fg_path: {self._fg_wl.resctrl.MOUNT_POINT/self._fg_wl.group_name}') + logger.debug(f'fg_mask: {fg_mask}, bg_mask: {bg_mask}') + logger.debug(f'fg_path: {self._fg_wl.resctrl.MOUNT_POINT/self._fg_wl.group_name}') self._fg_wl.resctrl.assign_llc(*fg_mask) self._bg_wl.resctrl.assign_llc(*bg_mask) @@ -268,8 +270,8 @@ def profile_needed(self, profile_interval, schedule_interval, count: int) -> boo logger = logging.getLogger(__name__) profile_freq = int(profile_interval / schedule_interval) fg_wl = self.foreground_workload - logger.info(f'count: {count}, profile_freq: {profile_freq}, ' - f'fg_wl.is_num_threads_changed(): {fg_wl.is_num_threads_changed()}') + logger.debug(f'count: {count}, profile_freq: {profile_freq}, ' + f'fg_wl.is_num_threads_changed(): {fg_wl.is_num_threads_changed()}') if fg_wl.is_num_threads_changed(): fg_wl.thread_changed_before = True diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index b906b96..773a6b8 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -1,6 +1,5 @@ # coding: UTF-8 -import logging import re import subprocess from pathlib import Path @@ -47,11 +46,9 @@ def add_task(self, pid: int) -> None: input=f'{pid}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) def assign_llc(self, *masks: str) -> None: - logger = logging.getLogger(__name__) masks = (f'{i}={mask}' for i, mask in enumerate(masks)) mask = ';'.join(masks) # subprocess.check_call('ls -ll /sys/fs/resctrl/', shell=True) - logger.info(f'mask: {mask}') subprocess.run(args=('sudo', 'tee', str(self._group_path / 'schemata')), input=f'L3:{mask}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) diff --git a/isolating_controller/workload.py 
b/isolating_controller/workload.py index adbbf2f..ed75e7d 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -179,14 +179,14 @@ def calc_avg_solorun(self) -> None: counts = 0 sum_of_items = BasicMetric(interval=50) for item in self.solorun_data_queue: - logger.info(f'item in solorun_data_queue : {item}') + logger.debug(f'item in solorun_data_queue : {item}') sum_of_items += item - logger.info(f'sum_of_items[{counts}] : {sum_of_items}') + logger.debug(f'sum_of_items[{counts}] : {sum_of_items}') counts += 1 - logger.info(f'self.solorun_data_queue : {self.solorun_data_queue}') - logger.info(f'after sum, sum_of_items : {sum_of_items}') + logger.debug(f'self.solorun_data_queue : {self.solorun_data_queue}') + logger.debug(f'after sum, sum_of_items : {sum_of_items}') self._avg_solorun_data = sum_of_items / counts - logger.info(f'after truediv, truediv_of_items : {self._avg_solorun_data}') + logger.debug(f'after truediv, truediv_of_items : {self._avg_solorun_data}') def calc_metric_diff(self) -> MetricDiff: logger = logging.getLogger(__name__) @@ -196,9 +196,9 @@ def calc_metric_diff(self) -> MetricDiff: else: solorun_data = data_map[self.name] curr_metric: BasicMetric = self._metrics[0] - logger.info(f'solorun_data L3 hit ratio: {solorun_data.l3hit_ratio}, ' - f'Local Mem BW ps : {solorun_data.local_mem_ps()}, ' - f'Instruction ps. : {solorun_data.instruction_ps}') + logger.debug(f'solorun_data L3 hit ratio: {solorun_data.l3hit_ratio}, ' + f'Local Mem BW ps : {solorun_data.local_mem_ps()}, ' + f'Instruction ps. 
: {solorun_data.instruction_ps}') return MetricDiff(curr_metric, solorun_data) def all_child_tid(self) -> Tuple[int, ...]: diff --git a/pending_queue.py b/pending_queue.py index 52d83b2..7b81419 100644 --- a/pending_queue.py +++ b/pending_queue.py @@ -23,7 +23,7 @@ def __len__(self) -> int: self._pending_list))) def add(self, workload: Workload) -> None: - logger = logging.getLogger(__name__) + logger = logging.getLogger('monitoring.pending_queue') logger.info(f'{workload} is ready for active') ready_queue = self._ready_queue[workload.cur_socket_id()] From 5af4cc1758682f0de1d8bf8bf63bce86e6dee0e3 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 14:26:10 +0900 Subject: [PATCH 38/82] add FileHandler to loggers --- controller.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/controller.py b/controller.py index 75efff4..b76bce9 100755 --- a/controller.py +++ b/controller.py @@ -345,23 +345,30 @@ def main() -> None: parser.add_argument('-b', '--metric-buf-size', dest='buf_size', default='50', type=int, help='metric buffer size per thread. 
(default : 50)') + os.makedirs('logs', exist_ok=True) + args = parser.parse_args() - # stream_handler = logging.StreamHandler() - stream_handler = logging.FileHandler('debug.log') - stream_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s')) + formatter = logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s') + stream_handler = logging.StreamHandler() + file_handler = logging.FileHandler(f'logs/debug_{datetime.datetime.now().isoformat()}.log') + stream_handler.setFormatter(formatter) + file_handler.setFormatter(formatter) controller_logger = logging.getLogger(__name__) controller_logger.setLevel(logging.INFO) controller_logger.addHandler(stream_handler) + controller_logger.addHandler(file_handler) module_logger = logging.getLogger(isolating_controller.__name__) module_logger.setLevel(logging.DEBUG) module_logger.addHandler(stream_handler) + module_logger.addHandler(file_handler) monitoring_logger = logging.getLogger('monitoring') monitoring_logger.setLevel(logging.DEBUG) monitoring_logger.addHandler(stream_handler) + monitoring_logger.addHandler(file_handler) controller = MainController(args.buf_size) controller.run() From b972f61dab454a33e3b852817080cbe3648aca25 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 16:56:56 +0900 Subject: [PATCH 39/82] merge resume_perf and pause_perf into resume and pause --- controller.py | 4 ---- isolating_controller/isolation/policies/base_policy.py | 7 ------- isolating_controller/workload.py | 6 +----- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/controller.py b/controller.py index b76bce9..976d0d8 100755 --- a/controller.py +++ b/controller.py @@ -271,9 +271,7 @@ def _profile_solorun(self) -> None: fg_wl = group.foreground_workload bg_wl = group.background_workload fg_wl.pause() - fg_wl.pause_perf() bg_wl.pause() - bg_wl.pause_perf() all_fg_wls.append(fg_wl) all_bg_wls.append(bg_wl) @@ -281,7 +279,6 @@ def _profile_solorun(self) -> None: for fg_wl in 
all_fg_wls: fg_wl.profile_solorun = True fg_wl.resume() - fg_wl.resume_perf() # four seconds for monitoring solo-run time.sleep(self._solorun_interval) @@ -293,7 +290,6 @@ def _profile_solorun(self) -> None: # resume BG workloads for bg_wl in all_bg_wls: bg_wl.resume() - bg_wl.resume_perf() def _update_all_workloads_num_threads(self): """ diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 4bc02bd..2af4804 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -233,9 +233,7 @@ def profile_solorun(self) -> None: fg_wl = self.foreground_workload bg_wl = self.background_workload fg_wl.pause() - fg_wl.pause_perf() bg_wl.pause() - bg_wl.pause_perf() all_fg_wls.append(fg_wl) all_bg_wls.append(bg_wl) @@ -245,7 +243,6 @@ def profile_solorun(self) -> None: fg_wl.profile_solorun = True self.fg_runs_alone = True fg_wl.resume() - fg_wl.resume_perf() def _update_all_workloads_num_threads(self): """ @@ -294,15 +291,11 @@ def profile_stop_cond(self, new_count: int) -> None: def all_workload_pause(self): self._fg_wl.pause() - self._fg_wl.pause_perf() self._bg_wl.pause() - self._bg_wl.pause_perf() def all_workload_resume(self): self._fg_wl.resume() - self._fg_wl.resume_perf() self._bg_wl.resume() - self._bg_wl.resume_perf() self.fg_runs_alone = False @property diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index ed75e7d..d509c1f 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -221,14 +221,10 @@ def cur_socket_id(self) -> int: def pause(self) -> None: self._proc_info.suspend() + self._perf_info.suspend() def resume(self) -> None: self._proc_info.resume() - - def pause_perf(self) -> None: - self._perf_info.suspend() - - def resume_perf(self) -> None: self._perf_info.resume() def is_num_threads_changed(self) -> bool: From 
fc7dfd621b3f5e44d3423fff69c7bb534ef75746 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 21:43:22 +0900 Subject: [PATCH 40/82] refactor online profiling related codes remove offline data remove proc_arbitrator.py remove isolating_controller/utils/cat.py remove duplicated codes in utils merge solorun metric queue in Workload into unified metric queue reuse Workload._metrics change the store and load method of isolator's configurations change method signature of Isolator.{_first_decision, _monitoring_result} --- controller.py | 126 ++---------- .../isolation/isolators/base_isolator.py | 17 +- .../isolation/isolators/cache.py | 30 ++- .../isolation/isolators/core.py | 39 ++-- .../isolation/isolators/idle.py | 6 +- .../isolation/isolators/memory.py | 34 ++-- .../isolation/policies/base_policy.py | 184 ++++++----------- .../metric_container/basic_metric.py | 80 +++----- isolating_controller/solorun_data/__init__.py | 0 .../solorun_data/canneal.json | 1 - isolating_controller/solorun_data/datas.py | 33 ---- .../solorun_data/facesim.json | 1 - .../solorun_data/fluidanimate.json | 1 - .../solorun_data/freqmine.json | 1 - isolating_controller/solorun_data/kmeans.json | 1 - isolating_controller/solorun_data/nn.json | 1 - .../solorun_data/particlefilter.json | 1 - .../solorun_data/raytrace.json | 1 - isolating_controller/solorun_data/sp.json | 1 - .../solorun_data/streamcluster.json | 1 - isolating_controller/utils/__init__.py | 1 - isolating_controller/utils/cat.py | 57 ------ isolating_controller/utils/cgroup/cpuset.py | 13 -- isolating_controller/utils/dvfs.py | 4 +- isolating_controller/utils/resctrl.py | 4 +- isolating_controller/workload.py | 84 ++------ proc_arbitrator.py | 185 ------------------ 27 files changed, 185 insertions(+), 722 deletions(-) delete mode 100644 isolating_controller/solorun_data/__init__.py delete mode 120000 isolating_controller/solorun_data/canneal.json delete mode 100644 isolating_controller/solorun_data/datas.py delete 
mode 120000 isolating_controller/solorun_data/facesim.json delete mode 120000 isolating_controller/solorun_data/fluidanimate.json delete mode 120000 isolating_controller/solorun_data/freqmine.json delete mode 120000 isolating_controller/solorun_data/kmeans.json delete mode 120000 isolating_controller/solorun_data/nn.json delete mode 120000 isolating_controller/solorun_data/particlefilter.json delete mode 120000 isolating_controller/solorun_data/raytrace.json delete mode 120000 isolating_controller/solorun_data/sp.json delete mode 120000 isolating_controller/solorun_data/streamcluster.json delete mode 100644 isolating_controller/utils/cat.py delete mode 100644 proc_arbitrator.py diff --git a/controller.py b/controller.py index 976d0d8..2dfa42f 100755 --- a/controller.py +++ b/controller.py @@ -11,7 +11,7 @@ import sys import time from threading import Thread -from typing import Dict +from typing import Dict, Optional import pika import psutil @@ -104,17 +104,7 @@ def _cbk_wl_monitor(self, workload: Workload, logger = logging.getLogger(f'monitoring.metric.{workload}') logger.debug(f'{metric} is given from ') - metric_que = None - if not workload.profile_solorun: - logger.debug('Metric_queue : workload.metrics') - # TODO: Do we need clear()? 
- metric_que = workload.metrics - elif workload.profile_solorun: - logger.debug('Metric_queue : workload.profile_solorun') - # init the solorun_data_queue - # workload.solorun_data_queue.clear() - # suspend ALL BGs in the same socket - metric_que = workload.solorun_data_queue + metric_que = workload.metrics if len(metric_que) == self._metric_buf_size: metric_que.pop() @@ -148,10 +138,12 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._pending_queue: PendingQueue = pending_queue self._interval: float = 0.2 # scheduling interval (sec) - self._count: int = 0 # scheduling counts self._profile_interval: float = 1.0 # check interval for phase change (sec) self._solorun_interval: float = 2.0 # the FG's solorun profiling interval (sec) + self._solorun_count: Dict[IsolationPolicy, Optional[int]] = dict() + self._isolation_groups: Dict[IsolationPolicy, int] = dict() + # Swapper init self._swapper: SwapIsolator = SwapIsolator(self._isolation_groups) @@ -165,33 +157,25 @@ def _isolate_workloads(self) -> None: logger.info(f'***************isolation of {group.name} #{iteration_num}***************') try: - if group.profile_needed(self._profile_interval, self._interval, self._count): - logger.debug('store_cur_configs') - group.store_cur_configs() - group.profile_stop_cond = self._count + int(self._solorun_interval / self._interval) - logger.debug('reset_to_initial_configs') - group.reset() - logger.debug(f'profile_solorun ({self._count} ~ {group.profile_stop_cond})') - group.profile_solorun() + if group.in_solorun_profiling: + if iteration_num - self._solorun_count[group] >= int(self._solorun_interval / self._interval): + logger.info('Stopping solorun profiling...') - logger.info('skipping isolation because of solorun profiling...') + group.stop_solorun_profiling() + del self._solorun_count[group] - elif group.foreground_workload.profile_solorun and self._count > group.profile_stop_cond: - logger.debug('all_workload_pause') - group.all_workload_pause() + 
logger.info('skipping isolation... because corun data isn\'t collected yet') + else: + logger.info('skipping isolation because of solorun profiling...') - logger.debug('fg.profile_solorun = False') - group.foreground_workload.profile_solorun = False - logger.debug('calc_and_update fg._avg_solorun_data') - group.foreground_workload.calc_avg_solorun() - logger.debug(f'fg_wl.avg_solorun_data: {group.foreground_workload.avg_solorun_data}') - logger.debug('reset_stored_configs') - group.reset_stored_configs() - - logger.debug('all_workload_resume') - group.all_workload_resume() + continue - logger.info('skipping isolation... because corun data isn\'t collected yet') + # TODO: first expression can lead low reactivity + elif iteration_num % int(self._profile_interval / self._interval) == 0 and group.profile_needed(): + group.start_solorun_profiling() + self._solorun_count[group] = iteration_num + logger.info('skipping isolation because of solorun profiling...') + continue if group.new_isolator_needed: group.choose_next_isolator() @@ -199,10 +183,6 @@ def _isolate_workloads(self) -> None: cur_isolator: Isolator = group.cur_isolator decided_next_step: NextStep = cur_isolator.decide_next_step() - - if group.fg_runs_alone is True: - decided_next_step = NextStep.IDLE - logger.info(f'Monitoring Result : {decided_next_step.name}') if decided_next_step is NextStep.STRENGTHEN: @@ -224,7 +204,6 @@ def _isolate_workloads(self) -> None: finally: self._isolation_groups[group] += 1 - self._count += 1 def _register_pending_workloads(self) -> None: """ @@ -238,7 +217,6 @@ def _register_pending_workloads(self) -> None: logger.info(f'{pending_group} is created') self._isolation_groups[pending_group] = 0 - pending_group.init_isolators() def _remove_ended_groups(self) -> None: """ @@ -259,68 +237,6 @@ def _remove_ended_groups(self) -> None: group.reset() del self._isolation_groups[group] - def _profile_solorun(self) -> None: - """ - profile solorun status of a workload - :return: - """ - 
all_fg_wls = list() - all_bg_wls = list() - # suspend all workloads and their perf agents - for group in self._isolation_groups: - fg_wl = group.foreground_workload - bg_wl = group.background_workload - fg_wl.pause() - bg_wl.pause() - all_fg_wls.append(fg_wl) - all_bg_wls.append(bg_wl) - - # run FG workloads alone - for fg_wl in all_fg_wls: - fg_wl.profile_solorun = True - fg_wl.resume() - - # four seconds for monitoring solo-run - time.sleep(self._solorun_interval) - - # disable solorun mode - for fg_wl in all_fg_wls: - fg_wl.profile_solorun = False - - # resume BG workloads - for bg_wl in all_bg_wls: - bg_wl.resume() - - def _update_all_workloads_num_threads(self): - """ - update the workloads' number of threads (cur_num_threads -> prev_num_threads) - :return: - """ - for group in self._isolation_groups: - bg_wl = group.background_workload - fg_wl = group.foreground_workload - bg_wl.update_num_threads() - fg_wl.update_num_threads() - - def _profile_needed(self, count: int) -> bool: - """ - This function checks if the profiling procedure should be called - - profile_freq : the frequencies of online profiling - :param count: This counts the number of entering the run func. 
loop - :return: Decision whether to initiate online solorun profiling - """ - - profile_freq = int(self._profile_interval / self._interval) - for group in self._isolation_groups: - fg_wl = group.foreground_workload - if count % profile_freq != 0 and fg_wl.is_num_threads_changed(): - self._update_all_workloads_num_threads() - return False - else: - self._update_all_workloads_num_threads() - return True - def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') @@ -362,7 +278,7 @@ def main() -> None: module_logger.addHandler(file_handler) monitoring_logger = logging.getLogger('monitoring') - monitoring_logger.setLevel(logging.DEBUG) + monitoring_logger.setLevel(logging.INFO) monitoring_logger.addHandler(stream_handler) monitoring_logger.addHandler(file_handler) diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index d83da2f..a0aebdb 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -1,6 +1,7 @@ # coding: UTF-8 from abc import ABCMeta, abstractmethod +from typing import Any, Optional from .. 
import NextStep from ...metric_container.basic_metric import MetricDiff @@ -9,7 +10,7 @@ class Isolator(metaclass=ABCMeta): def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: - self._prev_metric_diff: MetricDiff = foreground_wl.calc_metric_diff() + self._prev_metric_diff: MetricDiff = None self._foreground_wl = foreground_wl self._background_wl = background_wl @@ -19,6 +20,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._is_first_decision: bool = True + self._stored_config: Optional[Any] = None + def __del__(self): self.reset() @@ -72,20 +75,20 @@ def yield_isolation(self) -> None: self._is_first_decision = True @abstractmethod - def _first_decision(self) -> NextStep: + def _first_decision(self, cur_metric_diff: MetricDiff) -> NextStep: pass @abstractmethod - def _monitoring_result(self) -> NextStep: + def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: pass def decide_next_step(self) -> NextStep: if self._is_first_decision: self._is_first_decision = False - return self._first_decision() + return self._first_decision(self._foreground_wl.calc_metric_diff()) else: - return self._monitoring_result() + return self._monitoring_result(self._prev_metric_diff, self._foreground_wl.calc_metric_diff()) @abstractmethod def reset(self) -> None: @@ -104,7 +107,7 @@ def store_cur_config(self) -> None: """Store the current configuration""" pass - @abstractmethod def load_cur_config(self) -> None: """Load the current configuration""" - pass + if self._stored_config is None: + raise ValueError('Store configuration first!') diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index d5d6ce6..7e7ad30 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -5,6 +5,7 @@ from .base_isolator import Isolator from .. 
import NextStep +from ...metric_container.basic_metric import MetricDiff from ...utils import ResCtrl, numa_topology from ...workload import Workload @@ -19,7 +20,7 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._prev_step: Optional[int] = None self._cur_step: Optional[int] = None - self._stored_config: Tuple[str, ...] = None + self._stored_config: Optional[Tuple[int, int]] = None def strengthen(self) -> 'CacheIsolator': self._prev_step = self._cur_step @@ -72,8 +73,7 @@ def _enforce(self) -> None: masks[self._background_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) self._background_wl.resctrl.assign_llc(*masks) - def _first_decision(self) -> NextStep: - metric_diff = self._foreground_wl.calc_metric_diff() + def _first_decision(self, metric_diff: MetricDiff) -> NextStep: curr_diff = metric_diff.l3_hit_ratio logger = logging.getLogger(__name__) @@ -93,11 +93,9 @@ def _first_decision(self) -> NextStep: return NextStep.WEAKEN # TODO: consider turn off cache partitioning - def _monitoring_result(self) -> NextStep: - metric_diff = self._foreground_wl.calc_metric_diff() - - curr_diff = metric_diff.l3_hit_ratio - prev_diff = self._prev_metric_diff.l3_hit_ratio + def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: + curr_diff = cur_metric_diff.l3_hit_ratio + prev_diff = prev_metric_diff.l3_hit_ratio diff_of_diff = curr_diff - prev_diff logger = logging.getLogger(__name__) @@ -128,11 +126,11 @@ def reset(self) -> None: self._foreground_wl.resctrl.assign_llc(*masks) def store_cur_config(self) -> None: - fg_resctrl = self._foreground_wl.resctrl - fg_mask = fg_resctrl.get_llc_mask() - bg_resctrl = self._background_wl.resctrl - bg_mask = bg_resctrl.get_llc_mask() - self._stored_config = (fg_mask, bg_mask) - - def load_cur_config(self): - return self._stored_config + self._stored_config = (self._prev_step, self._cur_step) + + def load_cur_config(self) -> None: + 
super().load_cur_config() + + self._prev_step, self._cur_step = self._stored_config + self._enforce() + self._stored_config = None diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index cb8fa21..652ad33 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,10 +1,11 @@ # coding: UTF-8 import logging -from typing import Set, Tuple +from typing import Optional, Tuple from .base_isolator import Isolator from .. import NextStep, ResourceType +from ...metric_container.basic_metric import MetricDiff from ...workload import Workload @@ -25,7 +26,7 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._contentious_resource: ResourceType = ResourceType.MEMORY - self._stored_config: Tuple[Set[int], ...] = None + self._stored_config: Optional[Tuple[int, int]] = None def strengthen(self) -> 'CoreIsolator': """ @@ -77,9 +78,8 @@ def _enforce(self) -> None: self._foreground_wl.bound_cores = range(self._foreground_wl.orig_bound_cores[0], self._cur_fg_step + 1) self._background_wl.bound_cores = range(self._cur_bg_step, self._background_wl.orig_bound_cores[-1] + 1) - def _first_decision(self) -> NextStep: + def _first_decision(self, metric_diff: MetricDiff) -> NextStep: curr_diff = None - metric_diff = self._foreground_wl.calc_metric_diff() if self._contentious_resource == ResourceType.MEMORY: curr_diff = metric_diff.local_mem_util_ps @@ -105,20 +105,19 @@ def _first_decision(self) -> NextStep: else: return self._weaken_condition(metric_diff.instruction_ps) - def _monitoring_result(self) -> NextStep: + def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: logger = logging.getLogger(__name__) logger.info(f'self._contentious_resource: {self._contentious_resource.name}') - metric_diff = self._foreground_wl.calc_metric_diff() curr_diff = None diff_of_diff = None if 
self._contentious_resource == ResourceType.MEMORY: - curr_diff = metric_diff.local_mem_util_ps - prev_diff = self._prev_metric_diff.local_mem_util_ps + curr_diff = cur_metric_diff.local_mem_util_ps + prev_diff = prev_metric_diff.local_mem_util_ps diff_of_diff = curr_diff - prev_diff elif self._contentious_resource == ResourceType.CPU: - curr_diff = metric_diff.instruction_ps - prev_diff = self._prev_metric_diff.instruction_ps + curr_diff = cur_metric_diff.instruction_ps + prev_diff = prev_metric_diff.instruction_ps diff_of_diff = curr_diff - prev_diff logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') @@ -132,11 +131,11 @@ def _monitoring_result(self) -> NextStep: # Case2 : FG shows lower contention than solo-run -> Slower FG or Faster BG elif curr_diff > 0: - return self._weaken_condition(metric_diff.instruction_ps) + return self._weaken_condition(cur_metric_diff.instruction_ps) # Case3 : FG shows higher contention than solo-run else: - return self._strengthen_condition(metric_diff.instruction_ps) + return self._strengthen_condition(cur_metric_diff.instruction_ps) def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: fg_not_used_cores = len(self._foreground_wl.bound_cores) - self._foreground_wl.number_of_threads @@ -229,11 +228,11 @@ def _is_less_core_benefit(wl: Workload) -> bool: return False def store_cur_config(self) -> None: - fg_cgroup_cpuset = self._foreground_wl.cgroup_cpuset - bg_cgroup_cpuset = self._background_wl.cgroup_cpuset - fg_cpuset = fg_cgroup_cpuset.read_cpus() - bg_cpuset = bg_cgroup_cpuset.read_cpus() - self._stored_config = (fg_cpuset, bg_cpuset) - - def load_cur_config(self): - return self._stored_config + self._stored_config = (self._cur_fg_step, self._cur_bg_step) + + def load_cur_config(self) -> None: + super().load_cur_config() + + self._cur_fg_step, self._cur_bg_step = self._stored_config + self._enforce() + self._stored_config = None diff --git a/isolating_controller/isolation/isolators/idle.py 
b/isolating_controller/isolation/isolators/idle.py index 6354e08..2998271 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -25,7 +25,7 @@ def weaken(self) -> 'Isolator': def _enforce(self) -> None: pass - def _first_decision(self) -> NextStep: + def _first_decision(self, _) -> NextStep: self._fg_next_step = NextStep.IDLE self._bg_next_step = NextStep.IDLE return NextStep.IDLE @@ -33,7 +33,7 @@ def _first_decision(self) -> NextStep: def decide_next_step(self) -> NextStep: return self._monitoring_result() - def _monitoring_result(self) -> NextStep: + def _monitoring_result(self, **kwargs) -> NextStep: self._fg_next_step = NextStep.IDLE self._bg_next_step = NextStep.IDLE return NextStep.IDLE @@ -44,5 +44,5 @@ def reset(self) -> None: def store_cur_config(self) -> None: pass - def load_cur_config(self): + def load_cur_config(self) -> None: pass diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index 0816d8b..fec1f6d 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -1,10 +1,11 @@ # coding: UTF-8 import logging -from typing import Dict, Tuple +from typing import Optional from .base_isolator import Isolator from .. import NextStep +from ...metric_container.basic_metric import MetricDiff from ...utils import DVFS from ...workload import Workload @@ -17,8 +18,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) # FIXME: hard coded - self._cur_step = DVFS.MAX - self._stored_config: Tuple[Dict[int, int], ...] 
= None + self._cur_step: int = DVFS.MAX + self._stored_config: Optional[int] = None def strengthen(self) -> 'MemoryIsolator': self._cur_step -= DVFS.STEP @@ -44,8 +45,7 @@ def _enforce(self) -> None: DVFS.set_freq(self._cur_step, self._background_wl.bound_cores) - def _first_decision(self) -> NextStep: - metric_diff = self._foreground_wl.calc_metric_diff() + def _first_decision(self, metric_diff: MetricDiff) -> NextStep: curr_diff = metric_diff.local_mem_util_ps logger = logging.getLogger(__name__) @@ -64,11 +64,9 @@ def _first_decision(self) -> NextStep: else: return NextStep.WEAKEN - def _monitoring_result(self) -> NextStep: - metric_diff = self._foreground_wl.calc_metric_diff() - - curr_diff = metric_diff.local_mem_util_ps - prev_diff = self._prev_metric_diff.local_mem_util_ps + def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: + curr_diff = cur_metric_diff.local_mem_util_ps + prev_diff = prev_metric_diff.local_mem_util_ps diff_of_diff = curr_diff - prev_diff logger = logging.getLogger(__name__) @@ -90,11 +88,11 @@ def reset(self) -> None: DVFS.set_freq(DVFS.MAX, self._background_wl.orig_bound_cores) def store_cur_config(self) -> None: - fg_rapl_dvfs = self._foreground_wl.dvfs - bg_rapl_dvfs = self._background_wl.dvfs - fg_dvfs = fg_rapl_dvfs.cpufreq - bg_dvfs = bg_rapl_dvfs.cpufreq - self._stored_config = (fg_dvfs, bg_dvfs) - - def load_cur_config(self): - return self._stored_config + self._stored_config = self._cur_step + + def load_cur_config(self) -> None: + super().load_cur_config() + + self._cur_step = self._stored_config + self._enforce() + self._stored_config = None diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 2af4804..1696dbf 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -2,7 +2,7 @@ import logging from abc import ABCMeta, 
abstractmethod -from typing import Any, Dict, Type +from typing import Dict, Type from .. import ResourceType from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator @@ -19,14 +19,17 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._fg_wl = fg_wl self._bg_wl = bg_wl - self._isolator_map: Dict[Type[Isolator], Isolator] = dict() + self._isolator_map: Dict[Type[Isolator], Isolator] = dict(( + (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), + (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), + (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)), + )) self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR self._aggr_inst_diff: float = None - self._isolator_configs: Dict[Type[Isolator], Any] = dict() - self._profile_stop_cond: int = None # the count to stop solorun profiling condition - self._thread_changed: bool = False - self._fg_runs_alone: bool = False + + self._in_solorun_profile: bool = False + self._cached_fg_num_threads: int = fg_wl.number_of_threads def __hash__(self) -> int: return id(self) @@ -39,13 +42,6 @@ def __del__(self) -> None: for isolator in isolators: del self._isolator_map[isolator] - def init_isolators(self) -> None: - self._isolator_map = dict(( - (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), - (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)), - )) - @property @abstractmethod def new_isolator_needed(self) -> bool: @@ -63,7 +59,7 @@ def contentious_resource(self) -> ResourceType: logger.info(repr(metric_diff)) logger.info(f'l3_int: {cur_metric.l3_intensity:>7.04f}, ' f'mem_int: {cur_metric.mem_intensity:>7.04f}, ' - f'llc_util: {cur_metric.l3_util:>7.04f}') + f'l3_util: {cur_metric.l3_util:>7.04f}') if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: return ResourceType.CPU @@ -124,6 +120,10 @@ def 
name(self) -> str: def aggr_inst(self) -> float: return self._aggr_inst_diff + @property + def in_solorun_profiling(self) -> bool: + return self._in_solorun_profile + @property def most_cont_workload(self) -> Workload: fg_wl = self.foreground_workload @@ -157,8 +157,8 @@ def least_mem_bw_workload(self) -> Workload: fg_wl = self.foreground_workload bg_wl = self.background_workload - fg_mem_bw = fg_wl.metrics[0].local_mem_ps() - bg_mem_bw = bg_wl.metrics[0].local_mem_ps() + fg_mem_bw = fg_wl.metrics[0].local_mem_ps + bg_mem_bw = bg_wl.metrics[0].local_mem_ps if fg_mem_bw > bg_mem_bw: return bg_wl @@ -181,127 +181,57 @@ def reset(self) -> None: for isolator in self._isolator_map.values(): isolator.reset() - def store_cur_configs(self) -> None: - for isotype, isolator in self._isolator_map.items(): - isolator.store_cur_config() - self._isolator_configs[isotype] = isolator.load_cur_config() + def start_solorun_profiling(self) -> None: + """ profile solorun status of a workload """ + if self._in_solorun_profile: + raise ValueError('Stop the ongoing solorun profiling first!') - def reset_stored_configs(self) -> None: - """ - Reset stored configs - """ - logger = logging.getLogger(__name__) - # Cpuset (Cpuset) - cpuset_config = self._isolator_configs[CoreIsolator] - (fg_cpuset, bg_cpuset) = cpuset_config - self._fg_wl.cgroup_cpuset.assign_cpus(fg_cpuset) - self._bg_wl.cgroup_cpuset.assign_cpus(bg_cpuset) - - # DVFS (Dict(cpuid, freq)) - dvfs_config = self._isolator_configs[MemoryIsolator] - (fg_dvfs_config, bg_dvfs_config) = dvfs_config - fg_cpuset = fg_dvfs_config.keys() - fg_cpufreq = fg_dvfs_config.values() - fg_dvfs = self._fg_wl.dvfs - for fg_cpu in fg_cpuset: - freq = fg_cpufreq[fg_cpu] - fg_dvfs.set_freq(freq, fg_cpu) - - bg_cpuset = bg_dvfs_config.keys() - bg_cpufreq = bg_dvfs_config.values() - bg_dvfs = self._bg_wl.dvfs - for bg_cpu in bg_cpuset: - freq = bg_cpufreq[bg_cpu] - bg_dvfs.set_freq(freq, bg_cpu) - - # ResCtrl (Mask) - resctrl_config = 
self._isolator_configs[CacheIsolator] - (fg_mask, bg_mask) = resctrl_config - logger.debug(f'fg_mask: {fg_mask}, bg_mask: {bg_mask}') - logger.debug(f'fg_path: {self._fg_wl.resctrl.MOUNT_POINT/self._fg_wl.group_name}') - self._fg_wl.resctrl.assign_llc(*fg_mask) - self._bg_wl.resctrl.assign_llc(*bg_mask) - - def profile_solorun(self) -> None: - """ - profile solorun status of a workload - :return: - """ - # suspend all workloads and their perf agents - all_fg_wls = list() - all_bg_wls = list() - fg_wl = self.foreground_workload - bg_wl = self.background_workload - fg_wl.pause() - bg_wl.pause() - all_fg_wls.append(fg_wl) - all_bg_wls.append(bg_wl) - - # run FG workloads alone - for fg_wl in all_fg_wls: - fg_wl.solorun_data_queue.clear() # clear the prev. solorun data - fg_wl.profile_solorun = True - self.fg_runs_alone = True - fg_wl.resume() - - def _update_all_workloads_num_threads(self): - """ - update the workloads' number of threads (cur_num_threads -> prev_num_threads) - :return: - """ - bg_wl = self.background_workload - fg_wl = self.foreground_workload - bg_wl.update_num_threads() - fg_wl.update_num_threads() + self._in_solorun_profile = True - def profile_needed(self, profile_interval, schedule_interval, count: int) -> bool: - """ - This function checks if the profiling procedure should be called + # suspend all workloads and their perf agents + self._fg_wl.pause() + self._bg_wl.pause() - profile_freq : the frequencies of online profiling - :param profile_interval: the frequency of attempting profiling solorun - :param schedule_interval: the frequency of scheduling (isolation) - :param count: This counts the number of entering the run func. 
loop - :return: Decision whether to initiate online solorun profiling - """ - logger = logging.getLogger(__name__) - profile_freq = int(profile_interval / schedule_interval) - fg_wl = self.foreground_workload - logger.debug(f'count: {count}, profile_freq: {profile_freq}, ' - f'fg_wl.is_num_threads_changed(): {fg_wl.is_num_threads_changed()}') + self._fg_wl.metrics.clear() - if fg_wl.is_num_threads_changed(): - fg_wl.thread_changed_before = True + # store current configuration + for isolator in self._isolator_map.values(): + isolator.store_cur_config() + isolator.reset() - if count % profile_freq != 0 or not fg_wl.thread_changed_before: - self._update_all_workloads_num_threads() - return False - else: - self._update_all_workloads_num_threads() - fg_wl.thread_changed_before = False - return True + def stop_solorun_profiling(self) -> None: + if not self._in_solorun_profile: + raise ValueError('Start solorun profiling first!') - @property - def profile_stop_cond(self) -> int: - return self._profile_stop_cond + self._fg_wl.pause() - @profile_stop_cond.setter - def profile_stop_cond(self, new_count: int) -> None: - self._profile_stop_cond = new_count + logger = logging.getLogger(__name__) + logger.debug(f'number of collected solorun data: {len(self._fg_wl.metrics)}') + self._fg_wl.avg_solorun_data = BasicMetric.calc_avg(self._fg_wl.metrics) + logger.debug(f'calculated average solorun data: {self._fg_wl.avg_solorun_data}') - def all_workload_pause(self): - self._fg_wl.pause() - self._bg_wl.pause() + self._fg_wl.metrics.clear() - def all_workload_resume(self): + # resume all self._fg_wl.resume() self._bg_wl.resume() - self.fg_runs_alone = False - @property - def fg_runs_alone(self) -> bool: - return self._fg_runs_alone + # restore stored configuration + for isolator in self._isolator_map.values(): + isolator.load_cur_config() + + self._in_solorun_profile = False + + def profile_needed(self) -> bool: + """ + This function checks if the profiling procedure should be called + 
:return: Decision whether to initiate online solorun profiling + """ + # FIXME: or fg doesn't have solorun data - @fg_runs_alone.setter - def fg_runs_alone(self, new_val) -> None: - self._fg_runs_alone = new_val + cur_num_threads = self._fg_wl.number_of_threads + if self._fg_wl.avg_solorun_data is None or self._cached_fg_num_threads != cur_num_threads: + self._cached_fg_num_threads = cur_num_threads + return True + else: + return False diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index ccd7c16..4c152dc 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -1,8 +1,7 @@ # coding: UTF-8 -from __future__ import division - -from time import localtime, strftime +from statistics import mean +from typing import Iterable from cpuinfo import cpuinfo @@ -10,21 +9,37 @@ class BasicMetric: - def __init__(self, l2miss=0, l3miss=0, inst=0, cycles=0, stall_cycles=0, wall_cycles=0, intra_coh=0, - inter_coh=0, llc_size=0, local_mem=0, remote_mem=0, interval: int = 50): + def __init__(self, l2miss, l3miss, inst, cycles, stall_cycles, wall_cycles, intra_coh, + inter_coh, llc_size, local_mem, remote_mem, interval): self._l2miss = l2miss self._l3miss = l3miss self._instructions = inst - self._wall_cycles = wall_cycles self._cycles = cycles self._stall_cycles = stall_cycles + self._wall_cycles = wall_cycles self._intra_coh = intra_coh self._inter_coh = inter_coh self._llc_size = llc_size self._local_mem = local_mem self._remote_mem = remote_mem self._interval = interval - self._req_date = strftime("%I:%M:%S", localtime()) + + @classmethod + def calc_avg(cls, metrics: Iterable['BasicMetric']) -> 'BasicMetric': + return BasicMetric( + mean(metric._l2miss for metric in metrics), + mean(metric._l3miss for metric in metrics), + mean(metric._instructions for metric in metrics), + mean(metric._cycles for metric in metrics), + 
mean(metric._stall_cycles for metric in metrics), + mean(metric._wall_cycles for metric in metrics), + mean(metric._intra_coh for metric in metrics), + mean(metric._inter_coh for metric in metrics), + mean(metric._llc_size for metric in metrics), + mean(metric._local_mem for metric in metrics), + mean(metric._remote_mem for metric in metrics), + mean(metric._interval for metric in metrics), + ) @property def l2miss(self): @@ -70,6 +85,7 @@ def llc_size(self): def local_mem(self) -> float: return self._local_mem + @property def local_mem_ps(self) -> float: return self._local_mem * (1000 / self._interval) @@ -77,13 +93,10 @@ def local_mem_ps(self) -> float: def remote_mem(self): return self._remote_mem + @property def remote_mem_ps(self) -> float: return self._remote_mem * (1000 / self._interval) - @property - def req_date(self): - return self._req_date - @property def ipc(self) -> float: return self._instructions / self._cycles @@ -109,61 +122,28 @@ def l3hit_ratio(self) -> float: return 1 - self._l3miss / self._l2miss @property - def llc_util(self) -> float: + def l3_util(self) -> float: return self._llc_size / LLC_SIZE @property def l3_intensity(self) -> float: - return self.llc_util * self.l3hit_ratio + return self.l3_util * self.l3hit_ratio @property def mem_intensity(self) -> float: - return self.llc_util * self.l3miss_ratio - - @property - def l3_util(self) -> float: - return self.llc_util + return self.l3_util * self.l3miss_ratio def __repr__(self) -> str: return ', '.join(map(str, ( self._l2miss, self._l3miss, self._instructions, self._cycles, self._stall_cycles, self._wall_cycles, - self._intra_coh, self._inter_coh, self._llc_size, self._local_mem, self._remote_mem, - self._interval, self._req_date))) - - def __iadd__(self, others): - self._l2miss = self.l2miss + others.l2miss - self._l3miss = self.l3miss + others.l3miss - self._instructions = self.instruction + others.instruction - self._cycles = self._cycles + others.cycles - self._stall_cycles = 
self.stall_cycle + others.stall_cycle - self._wall_cycles = self.wall_cycles + others.wall_cycles - self._intra_coh = self.intra_coh + others.intra_coh - self._inter_coh = self.inter_coh + others.inter_coh - self._llc_size = self.llc_size + others.llc_size - self._local_mem = self.local_mem + others.local_mem - self._remote_mem = self.remote_mem + others.remote_mem - return self - - def __truediv__(self, other: int): - self._l2miss /= other - self._l3miss /= other - self._instructions /= other - self._cycles /= other - self._stall_cycles /= other - self._wall_cycles /= other - self._intra_coh /= other - self._inter_coh /= other - self._llc_size /= other - self._local_mem /= other - self._remote_mem /= other - return self + self._intra_coh, self._inter_coh, self._llc_size, self._local_mem, self._remote_mem, self._interval))) class MetricDiff: def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio - self._local_mem_ps = curr.local_mem_ps() / prev.local_mem_ps() - 1 - self._remote_mem_ps = curr.remote_mem_ps() / prev.remote_mem_ps() - 1 + self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 + self._remote_mem_ps = curr.remote_mem_ps / prev.remote_mem_ps - 1 self._instruction_ps = curr.instruction_ps / prev.instruction_ps - 1 @property diff --git a/isolating_controller/solorun_data/__init__.py b/isolating_controller/solorun_data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/isolating_controller/solorun_data/canneal.json b/isolating_controller/solorun_data/canneal.json deleted file mode 120000 index 66f4590..0000000 --- a/isolating_controller/solorun_data/canneal.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/canneal.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/datas.py b/isolating_controller/solorun_data/datas.py deleted file mode 100644 index 5e5693b..0000000 --- a/isolating_controller/solorun_data/datas.py +++ 
/dev/null @@ -1,33 +0,0 @@ -# coding: UTF-8 - -import json -from pathlib import Path -from typing import Dict - -from ..metric_container.basic_metric import BasicMetric - -data_map: Dict[str, BasicMetric] = dict() - - -def _init() -> None: - for data in Path(__file__).parent.iterdir(): # type: Path - if data.match('*.json'): - metric = json.loads(data.read_text()) - - item = BasicMetric(metric['l2miss'], - metric['l3miss'], - metric['instructions'], - metric['cycles'], - metric['stall_cycles'], - metric['wall_cycles'], - metric['intra_coh'], - metric['inter_coh'], - metric['llc_size'], - metric['local_mem'], - metric['remote_mem'], - 1000) - - data_map[metric['name']] = item - - -_init() diff --git a/isolating_controller/solorun_data/facesim.json b/isolating_controller/solorun_data/facesim.json deleted file mode 120000 index 0239831..0000000 --- a/isolating_controller/solorun_data/facesim.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/facesim.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/fluidanimate.json b/isolating_controller/solorun_data/fluidanimate.json deleted file mode 120000 index 0802176..0000000 --- a/isolating_controller/solorun_data/fluidanimate.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/fluidanimate.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/freqmine.json b/isolating_controller/solorun_data/freqmine.json deleted file mode 120000 index 6fd79ad..0000000 --- a/isolating_controller/solorun_data/freqmine.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/freqmine.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/kmeans.json b/isolating_controller/solorun_data/kmeans.json deleted file mode 120000 index d54f4be..0000000 --- a/isolating_controller/solorun_data/kmeans.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/kmeans.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/nn.json 
b/isolating_controller/solorun_data/nn.json deleted file mode 120000 index 843df3a..0000000 --- a/isolating_controller/solorun_data/nn.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/nn.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/particlefilter.json b/isolating_controller/solorun_data/particlefilter.json deleted file mode 120000 index 4c464ab..0000000 --- a/isolating_controller/solorun_data/particlefilter.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/particlefilter.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/raytrace.json b/isolating_controller/solorun_data/raytrace.json deleted file mode 120000 index 0c38ca2..0000000 --- a/isolating_controller/solorun_data/raytrace.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/raytrace.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/sp.json b/isolating_controller/solorun_data/sp.json deleted file mode 120000 index 6abde0e..0000000 --- a/isolating_controller/solorun_data/sp.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/sp.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/streamcluster.json b/isolating_controller/solorun_data/streamcluster.json deleted file mode 120000 index 7d1df85..0000000 --- a/isolating_controller/solorun_data/streamcluster.json +++ /dev/null @@ -1 +0,0 @@ -../../solorun_data/8core/streamcluster.json \ No newline at end of file diff --git a/isolating_controller/utils/__init__.py b/isolating_controller/utils/__init__.py index 9526532..9b40f6a 100644 --- a/isolating_controller/utils/__init__.py +++ b/isolating_controller/utils/__init__.py @@ -1,5 +1,4 @@ # coding: UTF-8 -from .cat import CAT from .dvfs import DVFS from .resctrl import ResCtrl diff --git a/isolating_controller/utils/cat.py b/isolating_controller/utils/cat.py deleted file mode 100644 index d44d12c..0000000 --- a/isolating_controller/utils/cat.py +++ /dev/null @@ -1,57 
+0,0 @@ -# coding: UTF-8 - -import os -import subprocess -from pathlib import Path -from typing import Iterable, Optional - - -def len_of_mask(mask: str) -> int: - cnt = 0 - num = int(mask, 16) - while num is not 0: - cnt += 1 - num >>= 1 - return cnt - - -class CAT: - MOUNT_POINT = Path('/sys/fs/resctrl') - - MIN = int((MOUNT_POINT / 'info' / 'L3' / 'min_cbm_bits').read_text()) - STEP = 1 - MAX = len_of_mask((MOUNT_POINT / 'info' / 'L3' / 'cbm_mask').read_text()) - - @staticmethod - def create_group(name: str) -> None: - subprocess.check_call(args=('sudo', 'mkdir', '-p', str(CAT.MOUNT_POINT / name))) - - @staticmethod - def add_task(name: str, pid: int) -> None: - subprocess.run(args=('sudo', 'tee', '-a', str(CAT.MOUNT_POINT / name / 'tasks')), - input=f'{pid}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) - - @staticmethod - def remove_group(name: str) -> None: - subprocess.check_call(args=('sudo', 'rmdir', str(CAT.MOUNT_POINT / name))) - - @staticmethod - def assign(name: str, *masks: Iterable[str]) -> None: - masks = (f'{i}={mask}' for i, mask in enumerate(masks)) - mask = ';'.join(masks) - subprocess.run(args=('sudo', 'tee', str(CAT.MOUNT_POINT / name / 'schemata')), - input=f'L3:{mask}\n', check=True, encoding='ASCII', stdout=subprocess.DEVNULL) - - @staticmethod - def gen_mask(start: int, end: Optional[int] = None) -> str: - if end is None or end > CAT.MAX: - end = CAT.MAX - - if start < 0: - raise ValueError('start must be greater than 0') - - return format(((1 << (end - start)) - 1) << (CAT.MAX - end), 'x') - - -if not os.path.ismount(str(CAT.MOUNT_POINT)): - subprocess.check_call(args=('sudo', 'mount', '-t', 'resctrl', 'resctrl', str(CAT.MOUNT_POINT))) diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index b82be82..086e2c0 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -9,7 +9,6 @@ class CpuSet(BaseCgroup): - MOUNT_POINT = 
'/sys/fs/cgroup/cpuset' CONTROLLER = 'cpuset' def assign_cpus(self, core_set: Iterable[int]) -> None: @@ -34,15 +33,3 @@ def read_mems(self) -> Set[int]: if mems is '': raise ProcessLookupError() return convert_to_set(mems) - - def get_cpu_affinity_from_group(self) -> Set[int]: - with open(f'{CpuSet.MOUNT_POINT}/{self._group_name}/cpuset.cpus', "r") as fp: - line: str = fp.readline() - core_set: Set[int] = convert_to_set(line) - return core_set - - def get_mem_affinity_from_group(self) -> Set[int]: - with open(f'{CpuSet.MOUNT_POINT}/{self._group_name}/cpuset.mems', "r") as fp: - line: str = fp.readline() - mem_set: Set[int] = convert_to_set(line) - return mem_set diff --git a/isolating_controller/utils/dvfs.py b/isolating_controller/utils/dvfs.py index 77b7a85..21c0f2f 100644 --- a/isolating_controller/utils/dvfs.py +++ b/isolating_controller/utils/dvfs.py @@ -1,7 +1,6 @@ # coding: UTF-8 import subprocess -from itertools import chain from pathlib import Path from typing import Dict, Iterable @@ -27,8 +26,7 @@ def set_freq_cgroup(self, target_freq: int): :param target_freq: freq. 
to set to cgroup cpuset :return: """ - cur_grp_cpuset = self._cur_cgroup.get_cpu_affinity_from_group() - DVFS.set_freq(target_freq, chain(cur_grp_cpuset)) + DVFS.set_freq(target_freq, self._cur_cgroup.read_cpus()) @property def cpufreq(self) -> Dict[int, int]: diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index 773a6b8..1ae14be 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -3,7 +3,7 @@ import re import subprocess from pathlib import Path -from typing import List, Tuple +from typing import List, Pattern, Tuple def len_of_mask(mask: str) -> int: @@ -26,7 +26,7 @@ class ResCtrl: MIN_BITS: int = int((MOUNT_POINT / 'info' / 'L3' / 'min_cbm_bits').read_text()) MIN_MASK: str = bits_to_mask(MIN_BITS) STEP = 1 - _read_regex: re = re.compile(r'L3:((\d+=[0-9a-fA-F]+;?)*)', re.MULTILINE) + _read_regex: Pattern = re.compile(r'L3:((\d+=[0-9a-fA-F]+;?)*)', re.MULTILINE) def __init__(self, group_name: str) -> None: self._group_name: str = group_name diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index d509c1f..bf10e65 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -3,12 +3,11 @@ import logging from collections import deque from itertools import chain -from typing import Deque, Iterable, Set, Tuple +from typing import Deque, Iterable, Optional, Set, Tuple import psutil from .metric_container.basic_metric import BasicMetric, MetricDiff -from .solorun_data.datas import data_map from .utils import DVFS, ResCtrl, numa_topology from .utils.cgroup import Cpu, CpuSet @@ -37,12 +36,8 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._resctrl = ResCtrl(self.group_name) self._dvfs = DVFS(self.group_name) - self._profile_solorun: bool = False - self._solorun_data_queue: Deque[ - BasicMetric] = deque() # This queue is used to collect and calculate avg. 
status - self._avg_solorun_data: BasicMetric = None # This variable is used to contain the recent avg. status - self._prev_num_threads: int = None - self._thread_changed_before: bool = False + # This variable is used to contain the recent avg. status + self._avg_solorun_data: Optional[BasicMetric] = None self._orig_bound_cores: Tuple[int, ...] = tuple(self._cgroup_cpuset.read_cpus()) self._orig_bound_mems: Set[int] = self._cgroup_cpuset.read_mems() @@ -141,65 +136,21 @@ def number_of_threads(self) -> int: return 0 @property - def prev_num_threads(self) -> int: - return self._prev_num_threads - - @property - def thread_changed_before(self) -> bool: - return self._thread_changed_before - - @thread_changed_before.setter - def thread_changed_before(self, new_val) -> None: - self._thread_changed_before = new_val - - def update_num_threads(self) -> None: - try: - self._prev_num_threads = self._proc_info.num_threads() - except psutil.NoSuchProcess: - self._prev_num_threads = 0 - - @property - def profile_solorun(self) -> bool: - return self._profile_solorun - - @profile_solorun.setter - def profile_solorun(self, new_flag: bool) -> None: - self._profile_solorun = new_flag - - @property - def solorun_data_queue(self) -> Deque[BasicMetric]: - return self._solorun_data_queue - - @property - def avg_solorun_data(self) -> BasicMetric: + def avg_solorun_data(self) -> Optional[BasicMetric]: return self._avg_solorun_data - def calc_avg_solorun(self) -> None: - logger = logging.getLogger(__name__) - counts = 0 - sum_of_items = BasicMetric(interval=50) - for item in self.solorun_data_queue: - logger.debug(f'item in solorun_data_queue : {item}') - sum_of_items += item - logger.debug(f'sum_of_items[{counts}] : {sum_of_items}') - counts += 1 - logger.debug(f'self.solorun_data_queue : {self.solorun_data_queue}') - logger.debug(f'after sum, sum_of_items : {sum_of_items}') - self._avg_solorun_data = sum_of_items / counts - logger.debug(f'after truediv, truediv_of_items : 
{self._avg_solorun_data}') + @avg_solorun_data.setter + def avg_solorun_data(self, new_data: BasicMetric) -> None: + self._avg_solorun_data = new_data def calc_metric_diff(self) -> MetricDiff: logger = logging.getLogger(__name__) - # solorun_data = data_map[self.name] - if self._avg_solorun_data is not None: - solorun_data = self._avg_solorun_data - else: - solorun_data = data_map[self.name] + curr_metric: BasicMetric = self._metrics[0] - logger.debug(f'solorun_data L3 hit ratio: {solorun_data.l3hit_ratio}, ' - f'Local Mem BW ps : {solorun_data.local_mem_ps()}, ' - f'Instruction ps. : {solorun_data.instruction_ps}') - return MetricDiff(curr_metric, solorun_data) + logger.debug(f'solorun_data L3 hit ratio: {self._avg_solorun_data.l3hit_ratio}, ' + f'Local Mem BW ps : {self._avg_solorun_data.local_mem_ps}, ' + f'Instruction ps. : {self._avg_solorun_data.instruction_ps}') + return MetricDiff(curr_metric, self._avg_solorun_data) def all_child_tid(self) -> Tuple[int, ...]: try: @@ -226,14 +177,3 @@ def pause(self) -> None: def resume(self) -> None: self._proc_info.resume() self._perf_info.resume() - - def is_num_threads_changed(self) -> bool: - """ - Detecting the phase changes based on the changes in the number of threads - :return: - """ - cur_num_threads = self.number_of_threads - if self._prev_num_threads == cur_num_threads: - return False - else: - return True diff --git a/proc_arbitrator.py b/proc_arbitrator.py deleted file mode 100644 index db032ad..0000000 --- a/proc_arbitrator.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python -# coding=UTF-8 - -from __future__ import division, print_function - -import multiprocessing -import os -import sys -import time -from signal import SIGCONT, SIGSTOP -from threading import Timer - -import psutil - -dead_status = (psutil.STATUS_DEAD, psutil.STATUS_ZOMBIE) - - -class ProcessArbitrator: - def __init__(self, pids, time_slice=50, iteration_limit=None): - """ - 생성자 - - Args: - pids (list of tuple): (실행할 process의 pid, 
perf의 pid)들의 list - time_slice (int): 한 process가 실행될 interval (ms 단위). 기본값은 500 - iteration_limit (int): 각 pid들의 반복횟수를 제한. `None` 으로하면 무제한. 기본값은 `None` - - Raises: - ValueError: pid의 타입이 이상할 경우 - - Notes - `time_slice` 가 0일경우 제대로 동작안함 (중요하지 않아보여서 처리하지 않음) - """ - if not pids: - raise ValueError('`pids` cannot be `None`') - - self._iteration_limit = iteration_limit - self._time_slice = time_slice - self._all_pids = list(pids) - self._remain_pids = list(pids) - self.next_proc() - - def next_proc(self): - # self._print_status() - - self._stop_all() - # print 'all process stopped' - - next_pid = self.pick_next_proc() - - if not next_pid: - # print 'no more process to run' - return - - # print 'next process is : ' + str(next_pid) - if next_pid[0] is not None: - os.kill(next_pid[0], SIGCONT) - - if next_pid[1] is not None: - os.kill(next_pid[1], SIGCONT) - - Timer(self._time_slice / 1000, self.next_proc).start() - - def pick_next_proc(self): - """ - `ProcessArbitrator` 에 포함된 process중에서 다음 time slice때 실행될 process의 pid를 구한다. - 더이상 실행할 process가 없을 때, 혹은 `iteration_limit` 에 도달했을때 `None` 을 반환한다. 
- - Returns: - tuple of int: 다음 time slice에 실행할 process의 pid - """ - while True: - if len(self._remain_pids) is 0: - if len(self._all_pids) is 0: - return None - - elif self._iteration_limit is 1: - self._resume_all() - return None - - else: - self._remain_pids.extend(self._all_pids) - if self._iteration_limit: - self._iteration_limit -= 1 - - next_pid = self._remain_pids.pop() - - is_ps1_dead = False - is_ps2_dead = False - - try: - if psutil.Process(next_pid[0]).status() in dead_status: - is_ps1_dead = True - except psutil.NoSuchProcess: - is_ps1_dead = True - - try: - if psutil.Process(next_pid[1]).status() in dead_status: - is_ps2_dead = True - except psutil.NoSuchProcess: - is_ps2_dead = True - - if is_ps1_dead and not is_ps2_dead: - return None, next_pid[1] - - elif not is_ps1_dead and is_ps2_dead: - return next_pid[0], None - - elif not is_ps1_dead and not is_ps2_dead: - return next_pid - - else: - self._all_pids.remove(next_pid) - - def set_time_slice(self, time_slice): - self._time_slice = time_slice - - def _stop_all(self): - try: - for pid in self._all_pids: - os.kill(pid[0], SIGSTOP) - os.kill(pid[1], SIGSTOP) - except: - pass - - def _resume_all(self): - try: - for pid in self._all_pids: - os.kill(pid[0], SIGCONT) - os.kill(pid[1], SIGCONT) - except: - pass - - def _print_status(self): - for pid in self._all_pids: - try: - process = psutil.Process(pid[0]) - sys.stdout.write(str(process.pid) + ':' + process.status() + ', ') - except psutil.NoSuchProcess: - pass - print() - - -def main(): - num = 4 - from datetime import datetime - - def test_thread(name): - for i in range(num): - time.sleep(1) - sys.stderr.write('{}\t{}({})\t{}\n'.format(datetime.now(), name, os.getpid(), i)) - - return - - processes = [] - - try: - proc_num = 2 - - pids = [] - - for n in range(proc_num): - process = multiprocessing.Process(target=test_thread, args=('process #' + str(n),)) - process.start() - process2 = multiprocessing.Process(target=test_thread, args=('process #' + 
str(n) + '\'s sidekick',)) - process2.start() - pids.append((process.pid, process2.pid)) - - processes.append(process) - processes.append(process2) - - ProcessArbitrator(pids, 50) - - for process in processes: - print('start to join {0}'.format(process.pid)) - process.join() - print('end of {0}'.format(process.pid)) - - except KeyboardInterrupt: - for process in processes: - process.terminate() - - -if __name__ == '__main__': - main() From ca3b60b774fdb0b1d8e29352d5b6737a99216460 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 22:10:19 +0900 Subject: [PATCH 41/82] fixes missing resume statement on base_policy.py --- isolating_controller/isolation/policies/base_policy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 1696dbf..f2fb762 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -199,6 +199,8 @@ def start_solorun_profiling(self) -> None: isolator.store_cur_config() isolator.reset() + self._fg_wl.resume() + def stop_solorun_profiling(self) -> None: if not self._in_solorun_profile: raise ValueError('Start solorun profiling first!') From 1161be19feb57136ff036e276ab383e6f56b4a29 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 10 Oct 2018 22:11:22 +0900 Subject: [PATCH 42/82] implement updated method to SchedIsolator --- .../isolation/isolators/schedule.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 34d08e5..aa29777 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -1,9 +1,11 @@ # coding: UTF-8 import logging +from typing import Optional from .base_isolator import Isolator from .. 
import NextStep +from ...metric_container.basic_metric import MetricDiff from ...workload import Workload @@ -17,6 +19,8 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: # FIXME: hard coded self._cur_step = background_wl.orig_bound_cores[0] + self._stored_config: Optional[int] = None + def strengthen(self) -> 'SchedIsolator': self._cur_step += 1 return self @@ -43,8 +47,7 @@ def _enforce(self) -> None: # FIXME: hard coded self._background_wl.bound_cores = range(self._cur_step, self._background_wl.orig_bound_cores[-1] + 1) - def _first_decision(self) -> NextStep: - metric_diff = self._foreground_wl.calc_metric_diff() + def _first_decision(self, metric_diff: MetricDiff) -> NextStep: curr_diff = metric_diff.local_mem_util_ps logger = logging.getLogger(__name__) @@ -63,11 +66,9 @@ def _first_decision(self) -> NextStep: else: return NextStep.WEAKEN - def _monitoring_result(self) -> NextStep: - metric_diff = self._foreground_wl.calc_metric_diff() - - curr_diff = metric_diff.local_mem_util_ps - prev_diff = self._prev_metric_diff.local_mem_util_ps + def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: + curr_diff = cur_metric_diff.local_mem_util_ps + prev_diff = prev_metric_diff.local_mem_util_ps diff_of_diff = curr_diff - prev_diff logger = logging.getLogger(__name__) @@ -89,3 +90,13 @@ def _monitoring_result(self) -> NextStep: def reset(self) -> None: if self._background_wl.is_running: self._background_wl.bound_cores = self._background_wl.orig_bound_cores + + def store_cur_config(self) -> None: + self._stored_config = self._cur_step + + def load_cur_config(self) -> None: + super().load_cur_config() + + self._cur_step = self._stored_config + self._enforce() + self._stored_config = None From 9d7ffbbf88f45ec0c25176db2cae2771868805ac Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 11:59:02 +0900 Subject: [PATCH 43/82] reduce invocation of Workload.calc_metric_diff() --- 
.../isolation/isolators/base_isolator.py | 17 +++++++++-------- .../isolation/isolators/cache.py | 3 +-- .../isolation/isolators/core.py | 3 +-- .../isolation/isolators/idle.py | 2 +- .../isolation/isolators/memory.py | 3 +-- .../isolation/policies/base_policy.py | 10 ++++++---- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base_isolator.py index a0aebdb..017aa33 100644 --- a/isolating_controller/isolation/isolators/base_isolator.py +++ b/isolating_controller/isolation/isolators/base_isolator.py @@ -58,14 +58,9 @@ def weaken(self) -> 'Isolator': pass @abstractmethod - def _enforce(self) -> None: - pass - def enforce(self) -> None: """Actually applies the isolation parameter that set on the current object""" - self._prev_metric_diff: MetricDiff = self._foreground_wl.calc_metric_diff() - - self._enforce() + pass def yield_isolation(self) -> None: """ @@ -83,12 +78,18 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr pass def decide_next_step(self) -> NextStep: + curr_metric_diff = self._foreground_wl.calc_metric_diff() + if self._is_first_decision: self._is_first_decision = False - return self._first_decision(self._foreground_wl.calc_metric_diff()) + next_step = self._first_decision(curr_metric_diff) else: - return self._monitoring_result(self._prev_metric_diff, self._foreground_wl.calc_metric_diff()) + next_step = self._monitoring_result(self._prev_metric_diff, curr_metric_diff) + + self._prev_metric_diff = curr_metric_diff + + return next_step @abstractmethod def reset(self) -> None: diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 7e7ad30..9c6ed8a 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -53,7 +53,7 @@ def is_min_level(self) -> bool: # FIXME: hard coded return 
self._cur_step is None or self._cur_step - ResCtrl.STEP <= ResCtrl.MIN_BITS - def _enforce(self) -> None: + def enforce(self) -> None: logger = logging.getLogger(__name__) if self._cur_step is None: @@ -132,5 +132,4 @@ def load_cur_config(self) -> None: super().load_cur_config() self._prev_step, self._cur_step = self._stored_config - self._enforce() self._stored_config = None diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 652ad33..d21a29e 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -69,7 +69,7 @@ def is_min_level(self) -> bool: return self._cur_bg_step == self._background_wl.orig_bound_cores[0] and \ self._cur_fg_step == self._foreground_wl.orig_bound_cores[-1] - def _enforce(self) -> None: + def enforce(self) -> None: logger = logging.getLogger(__name__) logger.debug(f'fg affinity : {self._foreground_wl.orig_bound_cores[0]}-{self._cur_fg_step}') logger.debug(f'bg affinity : {self._cur_bg_step}-{self._background_wl.orig_bound_cores[-1]}') @@ -234,5 +234,4 @@ def load_cur_config(self) -> None: super().load_cur_config() self._cur_fg_step, self._cur_bg_step = self._stored_config - self._enforce() self._stored_config = None diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index 2998271..5b3e24d 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -22,7 +22,7 @@ def is_min_level(self) -> bool: def weaken(self) -> 'Isolator': pass - def _enforce(self) -> None: + def enforce(self) -> None: pass def _first_decision(self, _) -> NextStep: diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index fec1f6d..a4770c4 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ 
-39,7 +39,7 @@ def is_min_level(self) -> bool: # FIXME: hard coded return DVFS.MAX <= self._cur_step + DVFS.STEP - def _enforce(self) -> None: + def enforce(self) -> None: logger = logging.getLogger(__name__) logger.info(f'frequency of bound_cores {self._background_wl.bound_cores} is {self._cur_step / 1_000_000}GHz') @@ -94,5 +94,4 @@ def load_cur_config(self) -> None: super().load_cur_config() self._cur_step = self._stored_config - self._enforce() self._stored_config = None diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index f2fb762..e4fb66b 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -212,16 +212,18 @@ def stop_solorun_profiling(self) -> None: self._fg_wl.avg_solorun_data = BasicMetric.calc_avg(self._fg_wl.metrics) logger.debug(f'calculated average solorun data: {self._fg_wl.avg_solorun_data}') + logger.debug('Enforcing restored configuration...') + # restore stored configuration + for isolator in self._isolator_map.values(): + isolator.load_cur_config() + isolator.enforce() + self._fg_wl.metrics.clear() # resume all self._fg_wl.resume() self._bg_wl.resume() - # restore stored configuration - for isolator in self._isolator_map.values(): - isolator.load_cur_config() - self._in_solorun_profile = False def profile_needed(self) -> bool: From d97ef4bff654055bb78df994456e439a38f6fdc3 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 12:40:03 +0900 Subject: [PATCH 44/82] set IdleIsolator right after solorun done --- controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/controller.py b/controller.py index 2dfa42f..f708f24 100755 --- a/controller.py +++ b/controller.py @@ -174,6 +174,7 @@ def _isolate_workloads(self) -> None: elif iteration_num % int(self._profile_interval / self._interval) == 0 and group.profile_needed(): group.start_solorun_profiling() 
self._solorun_count[group] = iteration_num + group.set_idle_isolator() logger.info('skipping isolation because of solorun profiling...') continue From bedf248a44312d966f412e54c72436bd072e8e71 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 12:40:27 +0900 Subject: [PATCH 45/82] clean up utils/dvfs.py --- isolating_controller/utils/dvfs.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/isolating_controller/utils/dvfs.py b/isolating_controller/utils/dvfs.py index 21c0f2f..6ee1f88 100644 --- a/isolating_controller/utils/dvfs.py +++ b/isolating_controller/utils/dvfs.py @@ -2,7 +2,7 @@ import subprocess from pathlib import Path -from typing import Dict, Iterable +from typing import Iterable from isolating_controller.utils.cgroup import CpuSet @@ -15,10 +15,6 @@ class DVFS: def __init__(self, group_name): self._group_name: str = group_name self._cur_cgroup = CpuSet(self._group_name) - self._cpufreq: Dict[int, int] = dict() - - # FIXME: hard coded to max freq. - self.set_freq_cgroup(DVFS.MAX) def set_freq_cgroup(self, target_freq: int): """ @@ -28,19 +24,6 @@ def set_freq_cgroup(self, target_freq: int): """ DVFS.set_freq(target_freq, self._cur_cgroup.read_cpus()) - @property - def cpufreq(self) -> Dict[int, int]: - """ - Return the cpufreq info - :return: _cpufreq is dict. 
key:val = cpu_id:cpu_freq - """ - return self._cpufreq - - def save_freq(self, freq: int): - cpuset = self._cpufreq.keys() - for cpu_id in cpuset: - self._cpufreq[cpu_id] = freq - @staticmethod def set_freq(freq: int, cores: Iterable[int]) -> None: """ From d00cb0b06178204457dfdce15b632d18bceda5cd Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 12:40:44 +0900 Subject: [PATCH 46/82] cleanup isolators/core.ppy --- .../isolation/isolators/core.py | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index d21a29e..ecce49a 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -138,10 +138,11 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr return self._strengthen_condition(cur_metric_diff.instruction_ps) def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: - fg_not_used_cores = len(self._foreground_wl.bound_cores) - self._foreground_wl.number_of_threads # BG Next Step Decision # ResourceType.CPU - If FG workload not fully use all its assigned cores..., then BG can weaken! if self._contentious_resource == ResourceType.CPU: + fg_not_used_cores = len(self._foreground_wl.bound_cores) - self._foreground_wl.number_of_threads + if fg_not_used_cores == 0: self._bg_next_step = NextStep.IDLE elif fg_not_used_cores > 0: @@ -178,6 +179,7 @@ def _strengthen_condition(self, fg_instruction_ps: float) -> NextStep: elif fg_instruction_ps <= self._INST_PS_THRESHOLD and \ self._foreground_wl.number_of_threads > len(self._foreground_wl.bound_cores): self._bg_next_step = NextStep.STRENGTHEN + # ResourceType.MEMORY - If BG workload can strengthen its cores... , then strengthen BG's cores! 
elif self._contentious_resource == ResourceType.MEMORY: if self._cur_bg_step == self._background_wl.orig_bound_cores[-1]: @@ -207,26 +209,6 @@ def reset(self) -> None: if self._foreground_wl.is_running: self._foreground_wl.bound_cores = self._foreground_wl.orig_bound_cores - @staticmethod - def _is_more_core_benefit(wl: Workload) -> bool: - wl_threads = wl.number_of_threads - wl_cpus = len(wl.cgroup_cpuset.read_cpus()) - print(f'{wl.wl_type}, {wl.name}, threads : {wl_threads}, len(cpuset): {wl_cpus}') - if wl_threads > wl_cpus: - return True - else: - return False - - @staticmethod - def _is_less_core_benefit(wl: Workload) -> bool: - wl_threads = wl.number_of_threads - wl_cpus = len(wl.cgroup_cpuset.read_cpus()) - print(f'{wl.wl_type}, {wl.name}, threads : {wl_threads}, len(cpuset): {wl_cpus}') - if wl_threads < wl_cpus: - return True - else: - return False - def store_cur_config(self) -> None: self._stored_config = (self._cur_fg_step, self._cur_bg_step) From 6e71c3848ea2a1669faf33b7487c37ce5cb71d9e Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 12:42:34 +0900 Subject: [PATCH 47/82] implement updated method to SchedIsolator --- isolating_controller/isolation/isolators/schedule.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index aa29777..d0bd76f 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -39,7 +39,7 @@ def is_min_level(self) -> bool: # FIXME: hard coded return self._cur_step == self._background_wl.orig_bound_cores[0] - def _enforce(self) -> None: + def enforce(self) -> None: logger = logging.getLogger(__name__) # FIXME: hard coded logger.info(f'affinity of background is {self._cur_step}-{self._background_wl.orig_bound_cores[-1]}') @@ -98,5 +98,4 @@ def load_cur_config(self) -> None: super().load_cur_config() self._cur_step = 
self._stored_config - self._enforce() self._stored_config = None From 13ca1b0745778b2ba84bec61ad212cd30ba2d988 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 14:43:22 +0900 Subject: [PATCH 48/82] restore offline solorun data and use it for background workload --- isolating_controller/solorun_data/__init__.py | 0 .../solorun_data/canneal.json | 1 + isolating_controller/solorun_data/datas.py | 33 +++++++++++++++++++ .../solorun_data/facesim.json | 1 + .../solorun_data/fluidanimate.json | 1 + .../solorun_data/freqmine.json | 1 + isolating_controller/solorun_data/kmeans.json | 1 + isolating_controller/solorun_data/nn.json | 1 + .../solorun_data/particlefilter.json | 1 + .../solorun_data/raytrace.json | 1 + isolating_controller/solorun_data/sp.json | 1 + .../solorun_data/streamcluster.json | 1 + isolating_controller/workload.py | 4 +++ 13 files changed, 47 insertions(+) create mode 100644 isolating_controller/solorun_data/__init__.py create mode 100644 isolating_controller/solorun_data/canneal.json create mode 100644 isolating_controller/solorun_data/datas.py create mode 100644 isolating_controller/solorun_data/facesim.json create mode 100644 isolating_controller/solorun_data/fluidanimate.json create mode 100644 isolating_controller/solorun_data/freqmine.json create mode 100644 isolating_controller/solorun_data/kmeans.json create mode 100644 isolating_controller/solorun_data/nn.json create mode 100644 isolating_controller/solorun_data/particlefilter.json create mode 100644 isolating_controller/solorun_data/raytrace.json create mode 100644 isolating_controller/solorun_data/sp.json create mode 100644 isolating_controller/solorun_data/streamcluster.json diff --git a/isolating_controller/solorun_data/__init__.py b/isolating_controller/solorun_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/isolating_controller/solorun_data/canneal.json b/isolating_controller/solorun_data/canneal.json new file mode 100644 index 0000000..66f4590 
--- /dev/null +++ b/isolating_controller/solorun_data/canneal.json @@ -0,0 +1 @@ +../../solorun_data/8core/canneal.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/datas.py b/isolating_controller/solorun_data/datas.py new file mode 100644 index 0000000..5e5693b --- /dev/null +++ b/isolating_controller/solorun_data/datas.py @@ -0,0 +1,33 @@ +# coding: UTF-8 + +import json +from pathlib import Path +from typing import Dict + +from ..metric_container.basic_metric import BasicMetric + +data_map: Dict[str, BasicMetric] = dict() + + +def _init() -> None: + for data in Path(__file__).parent.iterdir(): # type: Path + if data.match('*.json'): + metric = json.loads(data.read_text()) + + item = BasicMetric(metric['l2miss'], + metric['l3miss'], + metric['instructions'], + metric['cycles'], + metric['stall_cycles'], + metric['wall_cycles'], + metric['intra_coh'], + metric['inter_coh'], + metric['llc_size'], + metric['local_mem'], + metric['remote_mem'], + 1000) + + data_map[metric['name']] = item + + +_init() diff --git a/isolating_controller/solorun_data/facesim.json b/isolating_controller/solorun_data/facesim.json new file mode 100644 index 0000000..0239831 --- /dev/null +++ b/isolating_controller/solorun_data/facesim.json @@ -0,0 +1 @@ +../../solorun_data/8core/facesim.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/fluidanimate.json b/isolating_controller/solorun_data/fluidanimate.json new file mode 100644 index 0000000..0802176 --- /dev/null +++ b/isolating_controller/solorun_data/fluidanimate.json @@ -0,0 +1 @@ +../../solorun_data/8core/fluidanimate.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/freqmine.json b/isolating_controller/solorun_data/freqmine.json new file mode 100644 index 0000000..6fd79ad --- /dev/null +++ b/isolating_controller/solorun_data/freqmine.json @@ -0,0 +1 @@ +../../solorun_data/8core/freqmine.json \ No newline at end of file diff --git 
a/isolating_controller/solorun_data/kmeans.json b/isolating_controller/solorun_data/kmeans.json new file mode 100644 index 0000000..d54f4be --- /dev/null +++ b/isolating_controller/solorun_data/kmeans.json @@ -0,0 +1 @@ +../../solorun_data/8core/kmeans.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/nn.json b/isolating_controller/solorun_data/nn.json new file mode 100644 index 0000000..843df3a --- /dev/null +++ b/isolating_controller/solorun_data/nn.json @@ -0,0 +1 @@ +../../solorun_data/8core/nn.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/particlefilter.json b/isolating_controller/solorun_data/particlefilter.json new file mode 100644 index 0000000..4c464ab --- /dev/null +++ b/isolating_controller/solorun_data/particlefilter.json @@ -0,0 +1 @@ +../../solorun_data/8core/particlefilter.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/raytrace.json b/isolating_controller/solorun_data/raytrace.json new file mode 100644 index 0000000..0c38ca2 --- /dev/null +++ b/isolating_controller/solorun_data/raytrace.json @@ -0,0 +1 @@ +../../solorun_data/8core/raytrace.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/sp.json b/isolating_controller/solorun_data/sp.json new file mode 100644 index 0000000..6abde0e --- /dev/null +++ b/isolating_controller/solorun_data/sp.json @@ -0,0 +1 @@ +../../solorun_data/8core/sp.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/streamcluster.json b/isolating_controller/solorun_data/streamcluster.json new file mode 100644 index 0000000..7d1df85 --- /dev/null +++ b/isolating_controller/solorun_data/streamcluster.json @@ -0,0 +1 @@ +../../solorun_data/8core/streamcluster.json \ No newline at end of file diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index bf10e65..6396c9e 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ 
-8,6 +8,7 @@ import psutil from .metric_container.basic_metric import BasicMetric, MetricDiff +from .solorun_data.datas import data_map from .utils import DVFS, ResCtrl, numa_topology from .utils.cgroup import Cpu, CpuSet @@ -39,6 +40,9 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv # This variable is used to contain the recent avg. status self._avg_solorun_data: Optional[BasicMetric] = None + if wl_type == 'bg': + self._avg_solorun_data = data_map[name] + self._orig_bound_cores: Tuple[int, ...] = tuple(self._cgroup_cpuset.read_cpus()) self._orig_bound_mems: Set[int] = self._cgroup_cpuset.read_mems() From 32b617bc1c145b57834a95d3b41751b5f2b59ee4 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 14:44:45 +0900 Subject: [PATCH 49/82] rearrange base_policy.py and swap_iso.py --- controller.py | 11 +- .../isolation/policies/base_policy.py | 128 +++++++++--------- swap_iso.py | 22 +-- 3 files changed, 79 insertions(+), 82 deletions(-) diff --git a/controller.py b/controller.py index f708f24..4c6de82 100755 --- a/controller.py +++ b/controller.py @@ -150,8 +150,6 @@ def __init__(self, pending_queue: PendingQueue) -> None: def _isolate_workloads(self) -> None: logger = logging.getLogger(__name__) - self._swapper.try_swap() - for group, iteration_num in self._isolation_groups.items(): logger.info('') logger.info(f'***************isolation of {group.name} #{iteration_num}***************') @@ -206,6 +204,10 @@ def _isolate_workloads(self) -> None: finally: self._isolation_groups[group] += 1 + if len(tuple(filter(lambda x: x.safe_to_swap, self._isolation_groups.keys()))) < 2: + if self._swapper.swap_is_needed(): + self._swapper.do_swap() + def _register_pending_workloads(self) -> None: """ This function detects and registers the spawned workloads(threads). 
@@ -241,15 +243,12 @@ def _remove_ended_groups(self) -> None: def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') - # count = 0 + while True: self._remove_ended_groups() self._register_pending_workloads() time.sleep(self._interval) - # count += 1 - # if self._profile_needed(count): - # self._profile_solorun() self._isolate_workloads() diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index e4fb66b..c731a2c 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -116,63 +116,6 @@ def cur_isolator(self) -> Isolator: def name(self) -> str: return f'{self._fg_wl.name}({self._fg_wl.pid})' - @property - def aggr_inst(self) -> float: - return self._aggr_inst_diff - - @property - def in_solorun_profiling(self) -> bool: - return self._in_solorun_profile - - @property - def most_cont_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_inst_diff = fg_wl.inst_diff - bg_inst_diff = bg_wl.inst_diff - - # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_inst_diff < bg_inst_diff: - return fg_wl - else: - return bg_wl - - @property - def least_cont_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_ipc_diff = fg_wl.inst_diff - bg_ipc_diff = bg_wl.inst_diff - - # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_ipc_diff > bg_ipc_diff: - return fg_wl - else: - return bg_wl - - @property - def least_mem_bw_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_mem_bw = fg_wl.metrics[0].local_mem_ps - bg_mem_bw = bg_wl.metrics[0].local_mem_ps - - if fg_mem_bw > bg_mem_bw: - return bg_wl - else: - return fg_wl - - # FIXME: replace to property - 
def update_aggr_instr(self) -> None: - fg_diff = self._fg_wl.calc_metric_diff() - bg_diff = self._bg_wl.calc_metric_diff() - self._fg_wl._ipc_diff = fg_diff.instruction_ps - self._bg_wl._ipc_diff = bg_diff.instruction_ps - self._aggr_inst_diff = fg_diff.instruction_ps + bg_diff.instruction_ps - def set_idle_isolator(self) -> None: self._cur_isolator.yield_isolation() self._cur_isolator = IsolationPolicy._IDLE_ISOLATOR @@ -181,6 +124,12 @@ def reset(self) -> None: for isolator in self._isolator_map.values(): isolator.reset() + # Solorun profiling related + + @property + def in_solorun_profiling(self) -> bool: + return self._in_solorun_profile + def start_solorun_profiling(self) -> None: """ profile solorun status of a workload """ if self._in_solorun_profile: @@ -192,13 +141,13 @@ def start_solorun_profiling(self) -> None: self._fg_wl.pause() self._bg_wl.pause() - self._fg_wl.metrics.clear() - # store current configuration for isolator in self._isolator_map.values(): isolator.store_cur_config() isolator.reset() + self._fg_wl.metrics.clear() + self._fg_wl.resume() def stop_solorun_profiling(self) -> None: @@ -231,11 +180,68 @@ def profile_needed(self) -> bool: This function checks if the profiling procedure should be called :return: Decision whether to initiate online solorun profiling """ - # FIXME: or fg doesn't have solorun data - cur_num_threads = self._fg_wl.number_of_threads if self._fg_wl.avg_solorun_data is None or self._cached_fg_num_threads != cur_num_threads: self._cached_fg_num_threads = cur_num_threads return True else: return False + + # Swapper related + + @property + def safe_to_swap(self) -> bool: + return not self._in_solorun_profile and len(self._fg_wl.metrics) > 0 + + @property + def aggr_inst(self) -> float: + return self._aggr_inst_diff + + @property + def most_cont_workload(self) -> Workload: + fg_wl = self.foreground_workload + bg_wl = self.background_workload + + fg_inst_diff = fg_wl.inst_diff + bg_inst_diff = bg_wl.inst_diff + + # FIXME: 
Below condition is likely to fail due to too little differences between fg and bg + if fg_inst_diff < bg_inst_diff: + return fg_wl + else: + return bg_wl + + @property + def least_cont_workload(self) -> Workload: + fg_wl = self.foreground_workload + bg_wl = self.background_workload + + fg_ipc_diff = fg_wl.inst_diff + bg_ipc_diff = bg_wl.inst_diff + + # FIXME: Below condition is likely to fail due to too little differences between fg and bg + if fg_ipc_diff > bg_ipc_diff: + return fg_wl + else: + return bg_wl + + @property + def least_mem_bw_workload(self) -> Workload: + fg_wl = self.foreground_workload + bg_wl = self.background_workload + + fg_mem_bw = fg_wl.metrics[0].local_mem_ps + bg_mem_bw = bg_wl.metrics[0].local_mem_ps + + if fg_mem_bw > bg_mem_bw: + return bg_wl + else: + return fg_wl + + # FIXME: replace to property + def update_aggr_instr(self) -> None: + fg_diff = self._fg_wl.calc_metric_diff() + bg_diff = self._bg_wl.calc_metric_diff() + self._fg_wl._ipc_diff = fg_diff.instruction_ps + self._bg_wl._ipc_diff = bg_diff.instruction_ps + self._aggr_inst_diff = fg_diff.instruction_ps + bg_diff.instruction_ps diff --git a/swap_iso.py b/swap_iso.py index dbd9e78..8a4e653 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -39,7 +39,7 @@ def __del__(self): logger = logging.getLogger(__name__) logger.info('SwapIsolator is closed...') - def update_cont_group(self) -> None: + def select_cont_group(self) -> None: """ Most contentious group is the group which shows "the LOWEST aggr. ipc diff" Least contentious group is the group which shows "the HIGHEST aggr. 
ipc diff" @@ -57,16 +57,17 @@ def update_cont_group(self) -> None: swap_out_grp = group group.update_aggr_instr() - swap_in_grp = max(swap_in_grp, group, key=lambda x: x.aggr_ipc) - swap_out_grp = min(swap_out_grp, group, key=lambda x: x.aggr_ipc) + swap_in_grp = max(swap_in_grp, group, key=lambda x: x.aggr_inst) + swap_out_grp = min(swap_out_grp, group, key=lambda x: x.aggr_inst) self._most_cont_group = swap_out_grp self._least_cont_group = swap_in_grp def swap_is_needed(self) -> bool: + self.select_cont_group() + # FIXME: We used the average ipc diff value (We assume two workloads in a group at most) - avg_min_ipc_diff = self._most_cont_group.aggr_ipc / 2 - print(avg_min_ipc_diff) + avg_min_ipc_diff = self._most_cont_group.aggr_inst / 2 # TODO: Test the _IPC_DIFF_THRESHOLD if avg_min_ipc_diff > self._IPC_DIFF_THRESHOLD: @@ -91,9 +92,9 @@ def swap_is_needed(self) -> bool: return False def do_swap(self) -> None: + # Enable CPUSET memory migration self.pre_swap_setup() - # Enable CPUSET memory migration out_wl = self._most_cont_group.background_workload in_wl = self._least_cont_group.background_workload @@ -133,12 +134,3 @@ def pre_swap_setup(self) -> None: swap_out_workload.cgroup_cpuset.set_memory_migrate(True) swap_in_workload.cgroup_cpuset.set_memory_migrate(True) - - def try_swap(self) -> None: - if len(self._all_groups) < 2: - return - - self.update_cont_group() - - if self.swap_is_needed(): - self.do_swap() From 08c297778bb528fbf420211da54df98e6ff38266 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Thu, 11 Oct 2018 14:43:22 +0900 Subject: [PATCH 50/82] restore offline solorun data and use it for background workload --- isolating_controller/solorun_data/__init__.py | 0 .../solorun_data/canneal.json | 1 + isolating_controller/solorun_data/datas.py | 33 +++++++++++++++++++ .../solorun_data/facesim.json | 1 + .../solorun_data/fluidanimate.json | 1 + .../solorun_data/freqmine.json | 1 + isolating_controller/solorun_data/kmeans.json | 1 + 
isolating_controller/solorun_data/nn.json | 1 + .../solorun_data/particlefilter.json | 1 + .../solorun_data/raytrace.json | 1 + isolating_controller/solorun_data/sp.json | 1 + .../solorun_data/streamcluster.json | 1 + isolating_controller/workload.py | 4 +++ 13 files changed, 47 insertions(+) create mode 100644 isolating_controller/solorun_data/__init__.py create mode 120000 isolating_controller/solorun_data/canneal.json create mode 100644 isolating_controller/solorun_data/datas.py create mode 120000 isolating_controller/solorun_data/facesim.json create mode 120000 isolating_controller/solorun_data/fluidanimate.json create mode 120000 isolating_controller/solorun_data/freqmine.json create mode 120000 isolating_controller/solorun_data/kmeans.json create mode 120000 isolating_controller/solorun_data/nn.json create mode 120000 isolating_controller/solorun_data/particlefilter.json create mode 120000 isolating_controller/solorun_data/raytrace.json create mode 120000 isolating_controller/solorun_data/sp.json create mode 120000 isolating_controller/solorun_data/streamcluster.json diff --git a/isolating_controller/solorun_data/__init__.py b/isolating_controller/solorun_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/isolating_controller/solorun_data/canneal.json b/isolating_controller/solorun_data/canneal.json new file mode 120000 index 0000000..66f4590 --- /dev/null +++ b/isolating_controller/solorun_data/canneal.json @@ -0,0 +1 @@ +../../solorun_data/8core/canneal.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/datas.py b/isolating_controller/solorun_data/datas.py new file mode 100644 index 0000000..5e5693b --- /dev/null +++ b/isolating_controller/solorun_data/datas.py @@ -0,0 +1,33 @@ +# coding: UTF-8 + +import json +from pathlib import Path +from typing import Dict + +from ..metric_container.basic_metric import BasicMetric + +data_map: Dict[str, BasicMetric] = dict() + + +def _init() -> None: + for data in 
Path(__file__).parent.iterdir(): # type: Path + if data.match('*.json'): + metric = json.loads(data.read_text()) + + item = BasicMetric(metric['l2miss'], + metric['l3miss'], + metric['instructions'], + metric['cycles'], + metric['stall_cycles'], + metric['wall_cycles'], + metric['intra_coh'], + metric['inter_coh'], + metric['llc_size'], + metric['local_mem'], + metric['remote_mem'], + 1000) + + data_map[metric['name']] = item + + +_init() diff --git a/isolating_controller/solorun_data/facesim.json b/isolating_controller/solorun_data/facesim.json new file mode 120000 index 0000000..0239831 --- /dev/null +++ b/isolating_controller/solorun_data/facesim.json @@ -0,0 +1 @@ +../../solorun_data/8core/facesim.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/fluidanimate.json b/isolating_controller/solorun_data/fluidanimate.json new file mode 120000 index 0000000..0802176 --- /dev/null +++ b/isolating_controller/solorun_data/fluidanimate.json @@ -0,0 +1 @@ +../../solorun_data/8core/fluidanimate.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/freqmine.json b/isolating_controller/solorun_data/freqmine.json new file mode 120000 index 0000000..6fd79ad --- /dev/null +++ b/isolating_controller/solorun_data/freqmine.json @@ -0,0 +1 @@ +../../solorun_data/8core/freqmine.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/kmeans.json b/isolating_controller/solorun_data/kmeans.json new file mode 120000 index 0000000..d54f4be --- /dev/null +++ b/isolating_controller/solorun_data/kmeans.json @@ -0,0 +1 @@ +../../solorun_data/8core/kmeans.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/nn.json b/isolating_controller/solorun_data/nn.json new file mode 120000 index 0000000..843df3a --- /dev/null +++ b/isolating_controller/solorun_data/nn.json @@ -0,0 +1 @@ +../../solorun_data/8core/nn.json \ No newline at end of file diff --git 
a/isolating_controller/solorun_data/particlefilter.json b/isolating_controller/solorun_data/particlefilter.json new file mode 120000 index 0000000..4c464ab --- /dev/null +++ b/isolating_controller/solorun_data/particlefilter.json @@ -0,0 +1 @@ +../../solorun_data/8core/particlefilter.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/raytrace.json b/isolating_controller/solorun_data/raytrace.json new file mode 120000 index 0000000..0c38ca2 --- /dev/null +++ b/isolating_controller/solorun_data/raytrace.json @@ -0,0 +1 @@ +../../solorun_data/8core/raytrace.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/sp.json b/isolating_controller/solorun_data/sp.json new file mode 120000 index 0000000..6abde0e --- /dev/null +++ b/isolating_controller/solorun_data/sp.json @@ -0,0 +1 @@ +../../solorun_data/8core/sp.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/streamcluster.json b/isolating_controller/solorun_data/streamcluster.json new file mode 120000 index 0000000..7d1df85 --- /dev/null +++ b/isolating_controller/solorun_data/streamcluster.json @@ -0,0 +1 @@ +../../solorun_data/8core/streamcluster.json \ No newline at end of file diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index bf10e65..6396c9e 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -8,6 +8,7 @@ import psutil from .metric_container.basic_metric import BasicMetric, MetricDiff +from .solorun_data.datas import data_map from .utils import DVFS, ResCtrl, numa_topology from .utils.cgroup import Cpu, CpuSet @@ -39,6 +40,9 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv # This variable is used to contain the recent avg. status self._avg_solorun_data: Optional[BasicMetric] = None + if wl_type == 'bg': + self._avg_solorun_data = data_map[name] + self._orig_bound_cores: Tuple[int, ...] 
= tuple(self._cgroup_cpuset.read_cpus()) self._orig_bound_mems: Set[int] = self._cgroup_cpuset.read_mems() From 56feb9dc36e7fbd2703d2810c4715c5a56c66587 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 12 Oct 2018 01:36:47 +0900 Subject: [PATCH 51/82] implement affinity isolator (temporal name) --- .../isolation/isolators/affinity.py | 96 +++++++++++++++++++ .../isolation/isolators/schedule.py | 4 +- .../isolation/policies/base_policy.py | 18 ++-- .../isolation/policies/greedy_diff_policy.py | 27 +++--- .../greedy_diff_with_violation_policy.py | 7 +- 5 files changed, 122 insertions(+), 30 deletions(-) create mode 100644 isolating_controller/isolation/isolators/affinity.py diff --git a/isolating_controller/isolation/isolators/affinity.py b/isolating_controller/isolation/isolators/affinity.py new file mode 100644 index 0000000..11c41ae --- /dev/null +++ b/isolating_controller/isolation/isolators/affinity.py @@ -0,0 +1,96 @@ +# coding: UTF-8 + +import logging +from typing import Optional + +from isolating_controller.workload import Workload +from .base_isolator import Isolator +from .. 
import NextStep +from ...metric_container.basic_metric import MetricDiff + + +class AffinityIsolator(Isolator): + _DOD_THRESHOLD = 0.005 + _FORCE_THRESHOLD = 0.1 + + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: + super().__init__(foreground_wl, background_wl) + + self._cur_step: int = self._foreground_wl.orig_bound_cores[-1] + + self._stored_config: Optional[int] = None + + def strengthen(self) -> 'AffinityIsolator': + self._cur_step += 1 + return self + + @property + def is_max_level(self) -> bool: + # FIXME: hard coded + return self._cur_step + 1 == self._background_wl.bound_cores[0] + + @property + def is_min_level(self) -> bool: + return self._foreground_wl.orig_bound_cores == self._foreground_wl.bound_cores + + def weaken(self) -> 'AffinityIsolator': + self._cur_step -= 1 + return self + + def enforce(self) -> None: + logger = logging.getLogger(__name__) + logger.info(f'affinity of foreground is {self._foreground_wl.orig_bound_cores[0]}-{self._cur_step}') + + self._foreground_wl.bound_cores = range(self._foreground_wl.orig_bound_cores[0], self._cur_step + 1) + + def _first_decision(self, metric_diff: MetricDiff) -> NextStep: + curr_diff = metric_diff.instruction_ps + + logger = logging.getLogger(__name__) + logger.debug(f'current diff: {curr_diff:>7.4f}') + + if curr_diff < 0: + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN + elif curr_diff <= AffinityIsolator._FORCE_THRESHOLD: + return NextStep.STOP + else: + if self.is_min_level: + return NextStep.STOP + else: + return NextStep.WEAKEN + + def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: + curr_diff = cur_metric_diff.instruction_ps + prev_diff = prev_metric_diff.instruction_ps + diff_of_diff = curr_diff - prev_diff + + logger = logging.getLogger(__name__) + logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') + logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: 
{prev_diff:>7.4f}') + + if self.is_min_level or self.is_max_level \ + or abs(diff_of_diff) <= AffinityIsolator._DOD_THRESHOLD \ + or abs(curr_diff) <= AffinityIsolator._DOD_THRESHOLD: + return NextStep.STOP + + elif curr_diff > 0: + return NextStep.WEAKEN + + else: + return NextStep.STRENGTHEN + + def reset(self) -> None: + if self._foreground_wl.is_running: + self._foreground_wl.bound_cores = self._foreground_wl.orig_bound_cores + + def store_cur_config(self) -> None: + self._stored_config = self._cur_step + + def load_cur_config(self) -> None: + super().load_cur_config() + + self._cur_step = self._stored_config + self._stored_config = None diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index d0bd76f..12a5904 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -31,13 +31,12 @@ def weaken(self) -> 'SchedIsolator': @property def is_max_level(self) -> bool: - # FIXME: hard coded return self._cur_step == self._background_wl.orig_bound_cores[-1] @property def is_min_level(self) -> bool: # FIXME: hard coded - return self._cur_step == self._background_wl.orig_bound_cores[0] + return self._cur_step - 1 == self._foreground_wl.bound_cores[-1] def enforce(self) -> None: logger = logging.getLogger(__name__) @@ -75,7 +74,6 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - # FIXME: hard coded if self.is_min_level or self.is_max_level \ or abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= SchedIsolator._DOD_THRESHOLD: diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index e4fb66b..6ea1570 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ 
b/isolating_controller/isolation/policies/base_policy.py @@ -5,7 +5,8 @@ from typing import Dict, Type from .. import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, Isolator, MemoryIsolator +from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator +from ..isolators.affinity import AffinityIsolator from ...metric_container.basic_metric import BasicMetric, MetricDiff from ...workload import Workload @@ -22,7 +23,8 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._isolator_map: Dict[Type[Isolator], Isolator] = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (CoreIsolator, CoreIsolator(self._fg_wl, self._bg_wl)), + (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)), + (AffinityIsolator, AffinityIsolator(self._fg_wl, self._bg_wl)), )) self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR @@ -56,14 +58,11 @@ def contentious_resource(self) -> ResourceType: cur_metric: BasicMetric = self._fg_wl.metrics[0] logger = logging.getLogger(__name__) - logger.info(repr(metric_diff)) + logger.info(f'foreground : {metric_diff}') + logger.info(f'background : {self._bg_wl.calc_metric_diff()}') logger.info(f'l3_int: {cur_metric.l3_intensity:>7.04f}, ' f'mem_int: {cur_metric.mem_intensity:>7.04f}, ' f'l3_util: {cur_metric.l3_util:>7.04f}') - if abs(cur_metric.l3_intensity) < IsolationPolicy._CPU_THRESHOLD \ - and abs(cur_metric.mem_intensity) < IsolationPolicy._CPU_THRESHOLD: - return ResourceType.CPU - if metric_diff.local_mem_util_ps > 0 and metric_diff.l3_hit_ratio > 0: if metric_diff.l3_hit_ratio > metric_diff.local_mem_util_ps: return ResourceType.CACHE @@ -232,9 +231,12 @@ def profile_needed(self) -> bool: :return: Decision whether to initiate online solorun profiling """ # FIXME: or fg doesn't have solorun data + logger = logging.getLogger(__name__) cur_num_threads = self._fg_wl.number_of_threads - 
if self._fg_wl.avg_solorun_data is None or self._cached_fg_num_threads != cur_num_threads: + if self._fg_wl.avg_solorun_data is None \ + or cur_num_threads is not 0 and self._cached_fg_num_threads != cur_num_threads: + logger.debug(f'number of threads. cached: {self._cached_fg_num_threads}, current : {cur_num_threads}') self._cached_fg_num_threads = cur_num_threads return True else: diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 8a89c0f..6b9755d 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -2,9 +2,10 @@ import logging +from isolating_controller.isolation.isolators.affinity import AffinityIsolator from .base_policy import IsolationPolicy from .. import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -22,34 +23,30 @@ def choose_next_isolator(self) -> bool: logger = logging.getLogger(__name__) logger.debug('looking for new isolation...') - resource: ResourceType = self.contentious_resource() + # if foreground is web server (CPU critical) + if len(self._fg_wl.bound_cores) < self._fg_wl.number_of_threads: + if AffinityIsolator in self._isolator_map and not self._isolator_map[AffinityIsolator].is_max_level: + self._cur_isolator = self._isolator_map[AffinityIsolator] + logger.info(f'AffinityIsolator') + return True - if resource is ResourceType.CPU: - self._cur_isolator = self._isolator_map[CoreIsolator] - self._cur_isolator._contentious_resource = ResourceType.CPU - # logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.CPU.name}s') - logger.info(f'Resource Type: {ResourceType.CPU.name}, CoreIsolation') - return True + resource: ResourceType = self.contentious_resource() - elif 
resource is ResourceType.CACHE: + if resource is ResourceType.CACHE: self._cur_isolator = self._isolator_map[CacheIsolator] - # logger.info(f'Cache Isolation for {self._fg_wl} is started') logger.info(f'Resource Type: {ResourceType.CACHE.name}, CacheIsolation') return True elif not self._is_mem_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[MemoryIsolator] self._is_mem_isolated = True - # logger.info(f'Memory Bandwidth Isolation for {self._fg_wl} is started') logger.info(f'Resource Type: {ResourceType.MEMORY.name}, MemoryIsolation') return True elif resource is ResourceType.MEMORY: - self._cur_isolator = self._isolator_map[CoreIsolator] - self._cur_isolator._contentious_resource = ResourceType.MEMORY + self._cur_isolator = self._isolator_map[SchedIsolator] self._is_mem_isolated = False - # logger.info(f'Core Isolation for {self._fg_wl} is started to isolate {ResourceType.MEMORY.name} BW') - logger.info(f'Resource Type: {ResourceType.MEMORY.name}, CoreIsolation') + logger.info(f'Resource Type: {ResourceType.MEMORY.name}, SchedIsolation') return True else: diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 0a1b623..057dcfd 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .greedy_diff_policy import GreedyDiffPolicy from .. 
import ResourceType -from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator +from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload @@ -20,10 +20,9 @@ def _check_violation(self) -> bool: resource: ResourceType = self.contentious_resource() return \ - resource is ResourceType.CPU and not isinstance(self._cur_isolator, CoreIsolator) \ - or resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ + resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, CoreIsolator)) + and not isinstance(self._cur_isolator, SchedIsolator)) @property def new_isolator_needed(self) -> bool: From bf37f2929f4b7d1f7a4ea67eb22fc1ebb26d5488 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Fri, 12 Oct 2018 02:49:13 +0900 Subject: [PATCH 52/82] cleanup controller.py --- controller.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/controller.py b/controller.py index f708f24..32390da 100755 --- a/controller.py +++ b/controller.py @@ -241,15 +241,12 @@ def _remove_ended_groups(self) -> None: def run(self) -> None: logger = logging.getLogger(__name__) logger.info('starting isolation loop') - # count = 0 + while True: self._remove_ended_groups() self._register_pending_workloads() time.sleep(self._interval) - # count += 1 - # if self._profile_needed(count): - # self._profile_solorun() self._isolate_workloads() From d5ffc978c6b566eeb1bcf34bccab174f20998d82 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 14:46:16 +0900 Subject: [PATCH 53/82] fix the non-inclusive criteria for isolators --- isolating_controller/isolation/isolators/cache.py | 15 ++++++++++----- isolating_controller/isolation/isolators/core.py | 13 +++++++++---- .../isolation/isolators/memory.py | 15 ++++++++++----- 
.../isolation/isolators/schedule.py | 13 +++++++++---- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index 9c6ed8a..d1505ef 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -51,7 +51,7 @@ def is_max_level(self) -> bool: @property def is_min_level(self) -> bool: # FIXME: hard coded - return self._cur_step is None or self._cur_step - ResCtrl.STEP <= ResCtrl.MIN_BITS + return self._cur_step is None or self._cur_step - ResCtrl.STEP < ResCtrl.MIN_BITS def enforce(self) -> None: logger = logging.getLogger(__name__) @@ -102,16 +102,21 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - if self.is_min_level or self.is_max_level \ - or abs(diff_of_diff) <= CacheIsolator._DOD_THRESHOLD \ + if abs(diff_of_diff) <= CacheIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CacheIsolator._DOD_THRESHOLD: return NextStep.STOP elif curr_diff > 0: - return NextStep.WEAKEN + if self.is_min_level: + return NextStep.STOP + else: + return NextStep.WEAKEN else: - return NextStep.STRENGTHEN + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN def reset(self) -> None: masks = [ResCtrl.MIN_MASK] * (max(numa_topology.cur_online_nodes()) + 1) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index ecce49a..fe023df 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -124,18 +124,23 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') # Case1 : diff is too small to perform 
isolation - if self.is_max_level or self.is_min_level \ - or abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ + if abs(diff_of_diff) <= CoreIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= CoreIsolator._DOD_THRESHOLD: return NextStep.STOP # Case2 : FG shows lower contention than solo-run -> Slower FG or Faster BG elif curr_diff > 0: - return self._weaken_condition(cur_metric_diff.instruction_ps) + if self.is_min_level: + return NextStep.STOP + else: + return self._weaken_condition(cur_metric_diff.instruction_ps) # Case3 : FG shows higher contention than solo-run else: - return self._strengthen_condition(cur_metric_diff.instruction_ps) + if self.is_max_level: + return NextStep.STOP + else: + return self._strengthen_condition(cur_metric_diff.instruction_ps) def _weaken_condition(self, fg_instruction_ps: float) -> NextStep: # BG Next Step Decision diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index a4770c4..b7abe0e 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -37,7 +37,7 @@ def is_max_level(self) -> bool: @property def is_min_level(self) -> bool: # FIXME: hard coded - return DVFS.MAX <= self._cur_step + DVFS.STEP + return DVFS.MAX < self._cur_step + DVFS.STEP def enforce(self) -> None: logger = logging.getLogger(__name__) @@ -73,16 +73,21 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - if self.is_min_level or self.is_max_level \ - or abs(diff_of_diff) <= MemoryIsolator._DOD_THRESHOLD \ + if abs(diff_of_diff) <= MemoryIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= MemoryIsolator._DOD_THRESHOLD: return NextStep.STOP elif curr_diff > 0: - return NextStep.WEAKEN + if self.is_min_level: + return NextStep.STOP + else: + return NextStep.WEAKEN else: - 
return NextStep.STRENGTHEN + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN def reset(self) -> None: DVFS.set_freq(DVFS.MAX, self._background_wl.orig_bound_cores) diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 12a5904..6c9aac2 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -74,16 +74,21 @@ def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: Metr logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - if self.is_min_level or self.is_max_level \ - or abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ + if abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ or abs(curr_diff) <= SchedIsolator._DOD_THRESHOLD: return NextStep.STOP elif curr_diff > 0: - return NextStep.WEAKEN + if self.is_min_level: + return NextStep.STOP + else: + return NextStep.WEAKEN else: - return NextStep.STRENGTHEN + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN def reset(self) -> None: if self._background_wl.is_running: From 13edbb4b5e595925bf3c38779b94a103e8159942 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 15:44:16 +0900 Subject: [PATCH 54/82] cleanup log statements --- isolating_controller/isolation/policies/base_policy.py | 9 +++------ .../isolation/policies/greedy_diff_policy.py | 8 ++++---- isolating_controller/workload.py | 5 ----- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/isolating_controller/isolation/policies/base_policy.py b/isolating_controller/isolation/policies/base_policy.py index 6ea1570..8327d3d 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base_policy.py @@ -22,9 +22,9 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: 
self._isolator_map: Dict[Type[Isolator], Isolator] = dict(( (CacheIsolator, CacheIsolator(self._fg_wl, self._bg_wl)), - (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), - (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)), (AffinityIsolator, AffinityIsolator(self._fg_wl, self._bg_wl)), + (SchedIsolator, SchedIsolator(self._fg_wl, self._bg_wl)), + (MemoryIsolator, MemoryIsolator(self._fg_wl, self._bg_wl)), )) self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR @@ -55,14 +55,11 @@ def choose_next_isolator(self) -> bool: def contentious_resource(self) -> ResourceType: metric_diff: MetricDiff = self._fg_wl.calc_metric_diff() - cur_metric: BasicMetric = self._fg_wl.metrics[0] logger = logging.getLogger(__name__) logger.info(f'foreground : {metric_diff}') logger.info(f'background : {self._bg_wl.calc_metric_diff()}') - logger.info(f'l3_int: {cur_metric.l3_intensity:>7.04f}, ' - f'mem_int: {cur_metric.mem_intensity:>7.04f}, ' - f'l3_util: {cur_metric.l3_util:>7.04f}') + if metric_diff.local_mem_util_ps > 0 and metric_diff.l3_hit_ratio > 0: if metric_diff.l3_hit_ratio > metric_diff.local_mem_util_ps: return ResourceType.CACHE diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 6b9755d..6fe20f4 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -27,26 +27,26 @@ def choose_next_isolator(self) -> bool: if len(self._fg_wl.bound_cores) < self._fg_wl.number_of_threads: if AffinityIsolator in self._isolator_map and not self._isolator_map[AffinityIsolator].is_max_level: self._cur_isolator = self._isolator_map[AffinityIsolator] - logger.info(f'AffinityIsolator') + logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') return True resource: ResourceType = self.contentious_resource() if resource is ResourceType.CACHE: self._cur_isolator = 
self._isolator_map[CacheIsolator] - logger.info(f'Resource Type: {ResourceType.CACHE.name}, CacheIsolation') + logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') return True elif not self._is_mem_isolated and resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[MemoryIsolator] self._is_mem_isolated = True - logger.info(f'Resource Type: {ResourceType.MEMORY.name}, MemoryIsolation') + logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') return True elif resource is ResourceType.MEMORY: self._cur_isolator = self._isolator_map[SchedIsolator] self._is_mem_isolated = False - logger.info(f'Resource Type: {ResourceType.MEMORY.name}, SchedIsolation') + logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') return True else: diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 6396c9e..3567272 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -148,12 +148,7 @@ def avg_solorun_data(self, new_data: BasicMetric) -> None: self._avg_solorun_data = new_data def calc_metric_diff(self) -> MetricDiff: - logger = logging.getLogger(__name__) - curr_metric: BasicMetric = self._metrics[0] - logger.debug(f'solorun_data L3 hit ratio: {self._avg_solorun_data.l3hit_ratio}, ' - f'Local Mem BW ps : {self._avg_solorun_data.local_mem_ps}, ' - f'Instruction ps. 
: {self._avg_solorun_data.instruction_ps}') return MetricDiff(curr_metric, self._avg_solorun_data) def all_child_tid(self) -> Tuple[int, ...]: From 8eb587fb6a59fc41ce895aa6109073c288bdb910 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 15:49:13 +0900 Subject: [PATCH 55/82] fix criteria of AffinityIsolator --- .../isolation/policies/greedy_diff_policy.py | 2 +- .../policies/greedy_diff_with_violation_policy.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 6fe20f4..87363eb 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -24,7 +24,7 @@ def choose_next_isolator(self) -> bool: logger.debug('looking for new isolation...') # if foreground is web server (CPU critical) - if len(self._fg_wl.bound_cores) < self._fg_wl.number_of_threads: + if len(self._fg_wl.bound_cores) * 2 < self._fg_wl.number_of_threads: if AffinityIsolator in self._isolator_map and not self._isolator_map[AffinityIsolator].is_max_level: self._cur_isolator = self._isolator_map[AffinityIsolator] logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 057dcfd..9de70b7 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -17,12 +17,15 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._violation_count: int = 0 def _check_violation(self) -> bool: + if isinstance(self._cur_isolator, AffinityIsolator): + return False + resource: ResourceType = self.contentious_resource() return \ resource is ResourceType.CACHE and 
not isinstance(self._cur_isolator, CacheIsolator) \ - or resource is ResourceType.MEMORY and (not isinstance(self._cur_isolator, MemoryIsolator) - and not isinstance(self._cur_isolator, SchedIsolator)) + or resource is ResourceType.MEMORY and not (isinstance(self._cur_isolator, MemoryIsolator) + or isinstance(self._cur_isolator, SchedIsolator)) @property def new_isolator_needed(self) -> bool: From d6fb42ef0827f9480df5d13f2b6214c53fce9527 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 15:58:45 +0900 Subject: [PATCH 56/82] handle workloads that solorun profiling when deleting a group --- controller.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/controller.py b/controller.py index 32390da..a849ed9 100755 --- a/controller.py +++ b/controller.py @@ -237,6 +237,9 @@ def _remove_ended_groups(self) -> None: # remove from containers group.reset() del self._isolation_groups[group] + if group.in_solorun_profiling: + group.stop_solorun_profiling() + del self._solorun_count[group] def run(self) -> None: logger = logging.getLogger(__name__) From 7a9c0662efad20503d7ce28c60b3fcf3373ebdbc Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 15:59:13 +0900 Subject: [PATCH 57/82] forward referencing AffinityIsolator --- isolating_controller/isolation/isolators/__init__.py | 1 + isolating_controller/isolation/policies/greedy_diff_policy.py | 3 +-- .../isolation/policies/greedy_diff_with_violation_policy.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/isolating_controller/isolation/isolators/__init__.py b/isolating_controller/isolation/isolators/__init__.py index 6bd83a0..7f25b66 100644 --- a/isolating_controller/isolation/isolators/__init__.py +++ b/isolating_controller/isolation/isolators/__init__.py @@ -1,6 +1,7 @@ # coding: UTF-8 +from .affinity import AffinityIsolator from .base_isolator import Isolator from .cache import CacheIsolator from .core import CoreIsolator diff --git 
a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy_diff_policy.py index 87363eb..7f266ff 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_policy.py @@ -2,10 +2,9 @@ import logging -from isolating_controller.isolation.isolators.affinity import AffinityIsolator from .base_policy import IsolationPolicy from .. import ResourceType -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import AffinityIsolator, CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py index 9de70b7..e94eed4 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py @@ -4,7 +4,7 @@ from .greedy_diff_policy import GreedyDiffPolicy from .. 
import ResourceType -from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ..isolators import AffinityIsolator, CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload From d27f04c38352ad37d5d919c265c44181c31a1472 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 16:15:29 +0900 Subject: [PATCH 58/82] rename filename of policies and BaseIsolator --- controller.py | 4 ++-- isolating_controller/isolation/isolators/__init__.py | 2 +- isolating_controller/isolation/isolators/affinity.py | 2 +- .../isolators/{base_isolator.py => base.py} | 0 isolating_controller/isolation/isolators/cache.py | 2 +- isolating_controller/isolation/isolators/core.py | 2 +- isolating_controller/isolation/isolators/idle.py | 2 +- isolating_controller/isolation/isolators/memory.py | 2 +- isolating_controller/isolation/isolators/schedule.py | 2 +- isolating_controller/isolation/policies/__init__.py | 12 ++++++------ .../isolation/policies/{base_policy.py => base.py} | 3 --- .../policies/{diff_policy.py => defensive.py} | 4 ++-- .../{diff_policy_cpu.py => defensive_cpu.py} | 4 ++-- ...olation_policy.py => defensive_with_violation.py} | 6 +++--- .../policies/{greedy_diff_policy.py => greedy.py} | 4 ++-- ..._violation_policy.py => greedy_with_violation.py} | 6 +++--- swap_iso.py | 2 +- 17 files changed, 28 insertions(+), 31 deletions(-) rename isolating_controller/isolation/isolators/{base_isolator.py => base.py} (100%) rename isolating_controller/isolation/policies/{base_policy.py => base.py} (98%) rename isolating_controller/isolation/policies/{diff_policy.py => defensive.py} (96%) rename isolating_controller/isolation/policies/{diff_policy_cpu.py => defensive_cpu.py} (96%) rename isolating_controller/isolation/policies/{diff_with_violation_policy.py => defensive_with_violation.py} (89%) rename isolating_controller/isolation/policies/{greedy_diff_policy.py => greedy.py} (95%) rename 
isolating_controller/isolation/policies/{greedy_diff_with_violation_policy.py => greedy_with_violation.py} (89%) diff --git a/controller.py b/controller.py index a849ed9..3844ce2 100755 --- a/controller.py +++ b/controller.py @@ -22,7 +22,7 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import GreedyDiffWViolationPolicy, IsolationPolicy +from isolating_controller.isolation.policies import GreedyWViolationPolicy, IsolationPolicy from isolating_controller.metric_container.basic_metric import BasicMetric from isolating_controller.workload import Workload from pending_queue import PendingQueue @@ -47,7 +47,7 @@ def __init__(self, metric_buf_size: int) -> None: self._rmq_host = 'localhost' self._rmq_creation_queue = 'workload_creation' - self._pending_wl = PendingQueue(GreedyDiffWViolationPolicy) + self._pending_wl = PendingQueue(GreedyWViolationPolicy) self._control_thread = ControlThread(self._pending_wl) def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: diff --git a/isolating_controller/isolation/isolators/__init__.py b/isolating_controller/isolation/isolators/__init__.py index 7f25b66..6ec23c9 100644 --- a/isolating_controller/isolation/isolators/__init__.py +++ b/isolating_controller/isolation/isolators/__init__.py @@ -2,7 +2,7 @@ from .affinity import AffinityIsolator -from .base_isolator import Isolator +from .base import Isolator from .cache import CacheIsolator from .core import CoreIsolator from .idle import IdleIsolator diff --git a/isolating_controller/isolation/isolators/affinity.py b/isolating_controller/isolation/isolators/affinity.py index 11c41ae..3a22c90 100644 --- a/isolating_controller/isolation/isolators/affinity.py +++ b/isolating_controller/isolation/isolators/affinity.py @@ -4,7 +4,7 @@ from typing import Optional from 
isolating_controller.workload import Workload -from .base_isolator import Isolator +from .base import Isolator from .. import NextStep from ...metric_container.basic_metric import MetricDiff diff --git a/isolating_controller/isolation/isolators/base_isolator.py b/isolating_controller/isolation/isolators/base.py similarity index 100% rename from isolating_controller/isolation/isolators/base_isolator.py rename to isolating_controller/isolation/isolators/base.py diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index d1505ef..be0b54b 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -3,7 +3,7 @@ import logging from typing import Optional, Tuple -from .base_isolator import Isolator +from .base import Isolator from .. import NextStep from ...metric_container.basic_metric import MetricDiff from ...utils import ResCtrl, numa_topology diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index fe023df..09acc7f 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -3,7 +3,7 @@ import logging from typing import Optional, Tuple -from .base_isolator import Isolator +from .base import Isolator from .. import NextStep, ResourceType from ...metric_container.basic_metric import MetricDiff from ...workload import Workload diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index 5b3e24d..462657d 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -1,6 +1,6 @@ # coding: UTF-8 -from .base_isolator import Isolator +from .base import Isolator from .. 
import NextStep diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index b7abe0e..e48e6f6 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -3,7 +3,7 @@ import logging from typing import Optional -from .base_isolator import Isolator +from .base import Isolator from .. import NextStep from ...metric_container.basic_metric import MetricDiff from ...utils import DVFS diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 6c9aac2..616b6e6 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -3,7 +3,7 @@ import logging from typing import Optional -from .base_isolator import Isolator +from .base import Isolator from .. import NextStep from ...metric_container.basic_metric import MetricDiff from ...workload import Workload diff --git a/isolating_controller/isolation/policies/__init__.py b/isolating_controller/isolation/policies/__init__.py index 5e517fa..6861774 100644 --- a/isolating_controller/isolation/policies/__init__.py +++ b/isolating_controller/isolation/policies/__init__.py @@ -1,8 +1,8 @@ # coding: UTF-8 -from .base_policy import IsolationPolicy -from .diff_policy import DiffPolicy -from .diff_policy_cpu import DiffCPUPolicy -from .diff_with_violation_policy import DiffWViolationPolicy -from .greedy_diff_policy import GreedyDiffPolicy -from .greedy_diff_with_violation_policy import GreedyDiffWViolationPolicy +from .base import IsolationPolicy +from .defensive import DefensivePolicy +from .defensive_cpu import DefensiveCPUPolicy +from .defensive_with_violation import DefensiveWViolationPolicy +from .greedy import GreedyPolicy +from .greedy_with_violation import GreedyWViolationPolicy diff --git a/isolating_controller/isolation/policies/base_policy.py 
b/isolating_controller/isolation/policies/base.py similarity index 98% rename from isolating_controller/isolation/policies/base_policy.py rename to isolating_controller/isolation/policies/base.py index 8327d3d..b70d6a8 100644 --- a/isolating_controller/isolation/policies/base_policy.py +++ b/isolating_controller/isolation/policies/base.py @@ -13,8 +13,6 @@ class IsolationPolicy(metaclass=ABCMeta): _IDLE_ISOLATOR: IdleIsolator = IdleIsolator() - # FIXME : _CPU_THRESHOLD needs test - _CPU_THRESHOLD = 0.1 def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._fg_wl = fg_wl @@ -227,7 +225,6 @@ def profile_needed(self) -> bool: This function checks if the profiling procedure should be called :return: Decision whether to initiate online solorun profiling """ - # FIXME: or fg doesn't have solorun data logger = logging.getLogger(__name__) cur_num_threads = self._fg_wl.number_of_threads diff --git a/isolating_controller/isolation/policies/diff_policy.py b/isolating_controller/isolation/policies/defensive.py similarity index 96% rename from isolating_controller/isolation/policies/diff_policy.py rename to isolating_controller/isolation/policies/defensive.py index f9285b3..1388391 100644 --- a/isolating_controller/isolation/policies/diff_policy.py +++ b/isolating_controller/isolation/policies/defensive.py @@ -2,13 +2,13 @@ import logging -from .base_policy import IsolationPolicy +from .base import IsolationPolicy from .. 
import ResourceType from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload -class DiffPolicy(IsolationPolicy): +class DefensivePolicy(IsolationPolicy): def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: super().__init__(fg_wl, bg_wl) diff --git a/isolating_controller/isolation/policies/diff_policy_cpu.py b/isolating_controller/isolation/policies/defensive_cpu.py similarity index 96% rename from isolating_controller/isolation/policies/diff_policy_cpu.py rename to isolating_controller/isolation/policies/defensive_cpu.py index 209a00e..f413d45 100644 --- a/isolating_controller/isolation/policies/diff_policy_cpu.py +++ b/isolating_controller/isolation/policies/defensive_cpu.py @@ -2,13 +2,13 @@ import logging -from .base_policy import IsolationPolicy +from .base import IsolationPolicy from .. import ResourceType from ..isolators import CacheIsolator, CoreIsolator, IdleIsolator, MemoryIsolator from ...workload import Workload -class DiffCPUPolicy(IsolationPolicy): +class DefensiveCPUPolicy(IsolationPolicy): def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: super().__init__(fg_wl, bg_wl) diff --git a/isolating_controller/isolation/policies/diff_with_violation_policy.py b/isolating_controller/isolation/policies/defensive_with_violation.py similarity index 89% rename from isolating_controller/isolation/policies/diff_with_violation_policy.py rename to isolating_controller/isolation/policies/defensive_with_violation.py index 4b264c7..6fae5ac 100644 --- a/isolating_controller/isolation/policies/diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/defensive_with_violation.py @@ -2,13 +2,13 @@ import logging -from .diff_policy import DiffPolicy +from .defensive import DefensivePolicy from .. 
import ResourceType from ..isolators import CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload -class DiffWViolationPolicy(DiffPolicy): +class DefensiveWViolationPolicy(DefensivePolicy): VIOLATION_THRESHOLD = 3 def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: @@ -35,7 +35,7 @@ def new_isolator_needed(self) -> bool: self._violation_count += 1 - if self._violation_count >= DiffWViolationPolicy.VIOLATION_THRESHOLD: + if self._violation_count >= DefensiveWViolationPolicy.VIOLATION_THRESHOLD: logger.info('new isolator is required due to violation') self.set_idle_isolator() self._clear_flags() diff --git a/isolating_controller/isolation/policies/greedy_diff_policy.py b/isolating_controller/isolation/policies/greedy.py similarity index 95% rename from isolating_controller/isolation/policies/greedy_diff_policy.py rename to isolating_controller/isolation/policies/greedy.py index 7f266ff..d04f68f 100644 --- a/isolating_controller/isolation/policies/greedy_diff_policy.py +++ b/isolating_controller/isolation/policies/greedy.py @@ -2,13 +2,13 @@ import logging -from .base_policy import IsolationPolicy +from .base import IsolationPolicy from .. 
import ResourceType from ..isolators import AffinityIsolator, CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload -class GreedyDiffPolicy(IsolationPolicy): +class GreedyPolicy(IsolationPolicy): def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: super().__init__(fg_wl, bg_wl) diff --git a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py b/isolating_controller/isolation/policies/greedy_with_violation.py similarity index 89% rename from isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py rename to isolating_controller/isolation/policies/greedy_with_violation.py index e94eed4..d8cc79d 100644 --- a/isolating_controller/isolation/policies/greedy_diff_with_violation_policy.py +++ b/isolating_controller/isolation/policies/greedy_with_violation.py @@ -2,13 +2,13 @@ import logging -from .greedy_diff_policy import GreedyDiffPolicy +from .greedy import GreedyPolicy from .. import ResourceType from ..isolators import AffinityIsolator, CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator from ...workload import Workload -class GreedyDiffWViolationPolicy(GreedyDiffPolicy): +class GreedyWViolationPolicy(GreedyPolicy): VIOLATION_THRESHOLD = 3 def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: @@ -38,7 +38,7 @@ def new_isolator_needed(self) -> bool: self._violation_count += 1 - if self._violation_count >= GreedyDiffWViolationPolicy.VIOLATION_THRESHOLD: + if self._violation_count >= GreedyWViolationPolicy.VIOLATION_THRESHOLD: logger.info('new isolator is required due to violation') self.set_idle_isolator() self._violation_count = 0 diff --git a/swap_iso.py b/swap_iso.py index dbd9e78..f9f740b 100644 --- a/swap_iso.py +++ b/swap_iso.py @@ -7,7 +7,7 @@ import psutil -from isolating_controller.isolation.policies.base_policy import IsolationPolicy +from isolating_controller.isolation.policies.base import IsolationPolicy from isolating_controller.workload import 
Workload From 75e0d5d184a6760094660c9b54a19f994930ce3b Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 20:11:18 +0900 Subject: [PATCH 59/82] update solorun data --- solorun_data/8core/canneal.json | 22 +++++++++++----------- solorun_data/8core/facesim.json | 22 +++++++++++----------- solorun_data/8core/fluidanimate.json | 22 +++++++++++----------- solorun_data/8core/freqmine.json | 22 +++++++++++----------- solorun_data/8core/kmeans.json | 22 +++++++++++----------- solorun_data/8core/nn.json | 22 +++++++++++----------- solorun_data/8core/particlefilter.json | 22 +++++++++++----------- solorun_data/8core/raytrace.json | 22 +++++++++++----------- solorun_data/8core/sp.json | 22 +++++++++++----------- solorun_data/8core/streamcluster.json | 22 +++++++++++----------- 10 files changed, 110 insertions(+), 110 deletions(-) diff --git a/solorun_data/8core/canneal.json b/solorun_data/8core/canneal.json index 00c56bc..9fb74ca 100644 --- a/solorun_data/8core/canneal.json +++ b/solorun_data/8core/canneal.json @@ -1,15 +1,15 @@ { "name": "canneal", - "runtime": 50.875282287597656, - "l2miss": 60792682.894211575, - "l3miss": 41080933.25349302, - "instructions": 2573983428.3233533, - "cycles": 10176539363.932137, - "stall_cycles": 8380277883.153692, - "wall_cycles": 2100813560.0598803, - "intra_coh": 274952.89421157684, - "inter_coh": 0.01996007984031936, + "runtime": 58.566492795944214, + "l2miss": 55642619.19650655, + "l3miss": 36857580.69868996, + "instructions": 2294152843.772926, + "cycles": 7968185474.899564, + "stall_cycles": 6375642091.790394, + "wall_cycles": 2109140231.161572, + "intra_coh": 381280.0349344978, + "inter_coh": 6730.183406113537, "llc_size": 41439544.54618474, - "local_mem": 3492287862.0359282, - "remote_mem": 4319358.72255489 + "local_mem": 2853812912.6288214, + "remote_mem": 7267913.781659389 } \ No newline at end of file diff --git a/solorun_data/8core/facesim.json b/solorun_data/8core/facesim.json index 4c39ab1..4751737 100644 --- 
a/solorun_data/8core/facesim.json +++ b/solorun_data/8core/facesim.json @@ -1,15 +1,15 @@ { "name": "facesim", - "runtime": 74.43683433532715, - "l2miss": 195316036.70765027, - "l3miss": 29172485.150273222, - "instructions": 31213504312.2541, - "cycles": 16004271485.874317, - "stall_cycles": 3584421050.068306, - "wall_cycles": 2102741078.6202188, - "intra_coh": 3020455.9972677594, - "inter_coh": 10.0, + "runtime": 90.43694233894348, + "l2miss": 166282636.99095023, + "l3miss": 24719981.719457015, + "instructions": 25880380705.06787, + "cycles": 13042011225.690044, + "stall_cycles": 2752212959.6040726, + "wall_cycles": 2112787419.7511313, + "intra_coh": 3078449.954751131, + "inter_coh": 13725.961538461539, "llc_size": 41439544.54618474, - "local_mem": 1403864382.9508197, - "remote_mem": 9712220.327868853 + "local_mem": 1171378898.8235295, + "remote_mem": 13001927.239819003 } \ No newline at end of file diff --git a/solorun_data/8core/fluidanimate.json b/solorun_data/8core/fluidanimate.json index 4d4f2c4..bb1a841 100644 --- a/solorun_data/8core/fluidanimate.json +++ b/solorun_data/8core/fluidanimate.json @@ -1,15 +1,15 @@ { "name": "fluidanimate", - "runtime": 58.332513093948364, - "l2miss": 79196720.6445993, - "l3miss": 50765289.181184664, - "instructions": 34181753854.285713, - "cycles": 19516734522.456444, - "stall_cycles": 2102575306.5156794, - "wall_cycles": 2100813560.0598803, - "intra_coh": 5085844.651567944, - "inter_coh": 0.3484320557491289, + "runtime": 70.6184606552124, + "l2miss": 66363428.112798266, + "l3miss": 41495381.67751265, + "instructions": 28449764402.603035, + "cycles": 15807868146.637745, + "stall_cycles": 3367753444.584237, + "wall_cycles": 2111054726.59436, + "intra_coh": 4452528.806941432, + "inter_coh": 13572.002892263197, "llc_size": 415121465.087108, - "local_mem": 3349890907.8745646, - "remote_mem": 18919809.337979093 + "local_mem": 2700881194.3890095, + "remote_mem": 44826055.35791756 } \ No newline at end of file diff --git 
a/solorun_data/8core/freqmine.json b/solorun_data/8core/freqmine.json index bb3d798..fe6c887 100644 --- a/solorun_data/8core/freqmine.json +++ b/solorun_data/8core/freqmine.json @@ -1,15 +1,15 @@ { "name": "freqmine", - "runtime": 85.35708165168762, - "l2miss": 58436243.56506239, - "l3miss": 8935877.920380274, - "instructions": 29329754498.395725, - "cycles": 16095461475.448605, - "stall_cycles": 3889697179.524658, - "wall_cycles": 2106684766.0249555, - "intra_coh": 16015597.219251337, - "inter_coh": 0.7961972667855021, + "runtime": 85.67928528785706, + "l2miss": 57477559.75029726, + "l3miss": 9170439.53626635, + "instructions": 29334012536.206898, + "cycles": 16076459416.147444, + "stall_cycles": 3861194938.287753, + "wall_cycles": 2110738194.9702735, + "intra_coh": 14914421.248513674, + "inter_coh": 15639.417360285373, "llc_size": 385291493.877551, - "local_mem": 665578631.681521, - "remote_mem": 6117472.133095663 + "local_mem": 495354752.15219975, + "remote_mem": 6255921.617122473 } \ No newline at end of file diff --git a/solorun_data/8core/kmeans.json b/solorun_data/8core/kmeans.json index 43c65c5..5f5dce8 100644 --- a/solorun_data/8core/kmeans.json +++ b/solorun_data/8core/kmeans.json @@ -1,15 +1,15 @@ { "name": "kmeans", - "runtime": 31.43491005897522, - "l2miss": 192604611.46579805, - "l3miss": 23967362.54071661, - "instructions": 17425663303.094463, - "cycles": 11884070884.723127, - "stall_cycles": 4724979741.205212, - "wall_cycles": 2101847222.2475572, - "intra_coh": 133296346.05863193, - "inter_coh": 2.2801302931596092, + "runtime": 40.35035014152527, + "l2miss": 185422775.6756757, + "l3miss": 19463647.593307592, + "instructions": 13773966502.368084, + "cycles": 9490958570.579151, + "stall_cycles": 3637396000.0, + "wall_cycles": 2109259512.4581723, + "intra_coh": 127064823.73230374, + "inter_coh": 11076.190476190477, "llc_size": 40324710.4, - "local_mem": 1667139777.4592834, - "remote_mem": 5069967.426710098 + "local_mem": 1297946805.868726, + 
"remote_mem": 6836998.918918919 } \ No newline at end of file diff --git a/solorun_data/8core/nn.json b/solorun_data/8core/nn.json index 42bab74..df5d6a2 100644 --- a/solorun_data/8core/nn.json +++ b/solorun_data/8core/nn.json @@ -1,15 +1,15 @@ { "name": "nn", - "runtime": 65.96833348274231, - "l2miss": 98489406.00308642, - "l3miss": 8380.293209876543, - "instructions": 10738637686.69753, - "cycles": 20138784439.367287, - "stall_cycles": 12751693133.67284, - "wall_cycles": 2103901838.425926, - "intra_coh": 70078248.8425926, - "inter_coh": 1.9290123456790123, + "runtime": 74.64870262145996, + "l2miss": 93579887.43448275, + "l3miss": 1771182.1103448276, + "instructions": 9515542575.682758, + "cycles": 16870611064.193104, + "stall_cycles": 10460715115.931034, + "wall_cycles": 2114596987.3655174, + "intra_coh": 63552416.08275862, + "inter_coh": 22423.144827586206, "llc_size": 40148534.0621118, - "local_mem": 165206344.69135803, - "remote_mem": 724132.3456790124 + "local_mem": 131959673.82068965, + "remote_mem": 14083460.413793104 } \ No newline at end of file diff --git a/solorun_data/8core/particlefilter.json b/solorun_data/8core/particlefilter.json index c837ebd..2d5a3e9 100644 --- a/solorun_data/8core/particlefilter.json +++ b/solorun_data/8core/particlefilter.json @@ -1,15 +1,15 @@ { "name": "particlefilter", - "runtime": 78.08439254760742, - "l2miss": 1892421869.2578125, - "l3miss": 781396.8098958333, - "instructions": 29723062949.791664, - "cycles": 11968647735.755207, - "stall_cycles": 1535074183.3203125, - "wall_cycles": 2102764573.671875, - "intra_coh": 2694.4921875, - "inter_coh": 0.078125, + "runtime": 88.62292170524597, + "l2miss": 1644216187.5072298, + "l3miss": 716752.0994794678, + "instructions": 26348741815.98612, + "cycles": 9531916426.674377, + "stall_cycles": 888746471.1162521, + "wall_cycles": 2112264332.689416, + "intra_coh": 25106.81318681319, + "inter_coh": 13258.438403701562, "llc_size": 41439544.54618474, - "local_mem": 53071360.0, - 
"remote_mem": 628906.6666666666 + "local_mem": 45910071.67148641, + "remote_mem": 1370608.3053788315 } \ No newline at end of file diff --git a/solorun_data/8core/raytrace.json b/solorun_data/8core/raytrace.json index 29dc9a3..2d1be8d 100644 --- a/solorun_data/8core/raytrace.json +++ b/solorun_data/8core/raytrace.json @@ -1,15 +1,15 @@ { "name": "raytrace", - "runtime": 75.00958156585693, - "l2miss": 37303490.04048583, - "l3miss": 8992488.259109313, - "instructions": 15097455247.354925, - "cycles": 7698034081.48448, - "stall_cycles": 2193381188.205128, - "wall_cycles": 2100733789.3387315, - "intra_coh": 603332.7395411606, - "inter_coh": 7.57085020242915, + "runtime": 96.63842177391052, + "l2miss": 29602739.589689635, + "l3miss": 7303374.339821147, + "instructions": 11771374169.647552, + "cycles": 5870027242.009469, + "stall_cycles": 1570524903.093109, + "wall_cycles": 2107925825.0184112, + "intra_coh": 478563.11415044713, + "inter_coh": 6612.183061546555, "llc_size": 329469194.70985156, - "local_mem": 652059320.48583, - "remote_mem": 2034180.8367071524 + "local_mem": 511230443.26144135, + "remote_mem": 2943431.7096265126 } \ No newline at end of file diff --git a/solorun_data/8core/sp.json b/solorun_data/8core/sp.json index 8fc70e0..3f48687 100644 --- a/solorun_data/8core/sp.json +++ b/solorun_data/8core/sp.json @@ -1,15 +1,15 @@ { "name": "SP", - "runtime": 127.34041666984558, - "l2miss": 811554100.6671963, - "l3miss": 395381549.872915, - "instructions": 31966554430.548054, - "cycles": 20707372417.760128, - "stall_cycles": 8246697940.540112, - "wall_cycles": 2103639492.8911834, - "intra_coh": 9351.890389197775, - "inter_coh": 0.023828435266084195, + "runtime": 139.1068513393402, + "l2miss": 732227921.8221735, + "l3miss": 372740335.3311379, + "instructions": 29445354524.156605, + "cycles": 16824258389.937798, + "stall_cycles": 5539390817.672887, + "wall_cycles": 2112598046.6154408, + "intra_coh": 60722.45883644347, + "inter_coh": 15559.180387852177, "llc_size": 
40323737.27388535, - "local_mem": 30224151708.975376, - "remote_mem": 124499140.01588562 + "local_mem": 27333130453.56751, + "remote_mem": 225875950.7647274 } \ No newline at end of file diff --git a/solorun_data/8core/streamcluster.json b/solorun_data/8core/streamcluster.json index 299adb9..d8e33d2 100644 --- a/solorun_data/8core/streamcluster.json +++ b/solorun_data/8core/streamcluster.json @@ -1,15 +1,15 @@ { "name": "streamcluster", - "runtime": 100.40241861343384, - "l2miss": 173347568.74529484, - "l3miss": 147510949.77415305, - "instructions": 10874242181.191969, - "cycles": 20151487489.05897, - "stall_cycles": 14274633174.85571, - "wall_cycles": 2101089271.7691345, - "intra_coh": 14622179.72396487, - "inter_coh": 0.12547051442910917, + "runtime": 108.04844522476196, + "l2miss": 168123959.88711193, + "l3miss": 141570986.0583255, + "instructions": 10224582760.649107, + "cycles": 16460289317.262463, + "stall_cycles": 10945278054.506115, + "wall_cycles": 2110291760.0282218, + "intra_coh": 13866344.04515522, + "inter_coh": 7407.25305738476, "llc_size": 41851667.692307696, - "local_mem": 9555511427.051443, - "remote_mem": 11094249.836888332 + "local_mem": 9034444626.60395, + "remote_mem": 15611501.335841957 } \ No newline at end of file From acee0383e68816e426c593262ea721d9a0b6c34e Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 20:11:31 +0900 Subject: [PATCH 60/82] add solorun data of swaptions --- isolating_controller/solorun_data/swaptions.json | 1 + solorun_data/8core/swaptions.json | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 120000 isolating_controller/solorun_data/swaptions.json create mode 100644 solorun_data/8core/swaptions.json diff --git a/isolating_controller/solorun_data/swaptions.json b/isolating_controller/solorun_data/swaptions.json new file mode 120000 index 0000000..f3191cd --- /dev/null +++ b/isolating_controller/solorun_data/swaptions.json @@ -0,0 +1 @@ +../../solorun_data/8core/swaptions.json \ No 
newline at end of file diff --git a/solorun_data/8core/swaptions.json b/solorun_data/8core/swaptions.json new file mode 100644 index 0000000..2e44d8c --- /dev/null +++ b/solorun_data/8core/swaptions.json @@ -0,0 +1,15 @@ +{ + "name": "swaptions", + "runtime": 51.841299295425415, + "l2miss": 7875566.221335992, + "l3miss": 70218.84346959123, + "instructions": 30888370965.343967, + "cycles": 16816697239.282152, + "stall_cycles": 3303835114.4366903, + "wall_cycles": 2116106348.8534398, + "intra_coh": 6450924.366899301, + "inter_coh": 21939.86041874377, + "llc_size": 41439544.54618474, + "local_mem": 602434.6161515453, + "remote_mem": 165963.54935194418 +} \ No newline at end of file From 243a55e7fad164e96f210059b849f453002034c6 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 21:24:38 +0900 Subject: [PATCH 61/82] fixed an error occurred when the `curr` metric was zero when generating `MetricDiff`. --- .../metric_container/basic_metric.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 4c152dc..63fb8d5 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -142,8 +142,19 @@ def __repr__(self) -> str: class MetricDiff: def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio - self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 - self._remote_mem_ps = curr.remote_mem_ps / prev.remote_mem_ps - 1 + + if curr.local_mem_ps == 0: + # TODO: is it fair? + self._local_mem_ps = 1 + else: + self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 + + if curr.remote_mem_ps == 0: + # TODO: is it fair? 
+ self._remote_mem_ps = 1 + else: + self._remote_mem_ps = curr.remote_mem_ps / prev.remote_mem_ps - 1 + self._instruction_ps = curr.instruction_ps / prev.instruction_ps - 1 @property From ba1a008bb08bd414ac497142e82cca16fe71df7b Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 22:55:32 +0900 Subject: [PATCH 62/82] cleanup workload.py --- .../isolation/policies/base.py | 28 ------------------- isolating_controller/workload.py | 7 ----- 2 files changed, 35 deletions(-) diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index b70d6a8..236d057 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -118,34 +118,6 @@ def aggr_inst(self) -> float: def in_solorun_profiling(self) -> bool: return self._in_solorun_profile - @property - def most_cont_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_inst_diff = fg_wl.inst_diff - bg_inst_diff = bg_wl.inst_diff - - # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_inst_diff < bg_inst_diff: - return fg_wl - else: - return bg_wl - - @property - def least_cont_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_ipc_diff = fg_wl.inst_diff - bg_ipc_diff = bg_wl.inst_diff - - # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_ipc_diff > bg_ipc_diff: - return fg_wl - else: - return bg_wl - @property def least_mem_bw_workload(self) -> Workload: fg_wl = self.foreground_workload diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 3567272..514b9da 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -1,6 +1,5 @@ # coding: UTF-8 -import logging from collections import deque from itertools import chain from typing import Deque, 
Iterable, Optional, Set, Tuple @@ -15,7 +14,6 @@ class Workload: """ - Workload class This class abstracts the process and contains the related metrics to represent its characteristics ControlThread schedules the groups of `Workload' instances to enforce their scheduling decisions """ @@ -30,7 +28,6 @@ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interv self._proc_info = psutil.Process(pid) self._perf_info = psutil.Process(perf_pid) - self._inst_diff: float = None self._cgroup_cpuset = CpuSet(self.group_name) self._cgroup_cpu = Cpu(self.group_name) @@ -124,10 +121,6 @@ def perf_interval(self): def is_running(self) -> bool: return self._proc_info.is_running() - @property - def inst_diff(self) -> float: - return self._inst_diff - @property def group_name(self) -> str: return f'{self.name}_{self.pid}' From fa3323586433d9fb1424f4844f5e52eeb3641d36 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 22:56:56 +0900 Subject: [PATCH 63/82] subdivide the criteria of solorun profiling, added solorun related logs --- controller.py | 1 + isolating_controller/isolation/policies/base.py | 17 +++++++++++++---- .../metric_container/basic_metric.py | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/controller.py b/controller.py index 3844ce2..2463c23 100755 --- a/controller.py +++ b/controller.py @@ -172,6 +172,7 @@ def _isolate_workloads(self) -> None: # TODO: first expression can lead low reactivity elif iteration_num % int(self._profile_interval / self._interval) == 0 and group.profile_needed(): + logger.info('Starting solorun profiling...') group.start_solorun_profiling() self._solorun_count[group] = iteration_num group.set_idle_isolator() diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index 236d057..23bb4c0 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -199,11 +199,20 @@ def 
profile_needed(self) -> bool: """ logger = logging.getLogger(__name__) + if self._fg_wl.avg_solorun_data is None: + logger.debug('initialize solorun data') + self._cached_fg_num_threads = self._fg_wl.number_of_threads + return True + + if not self._fg_wl.calc_metric_diff().verify(): + logger.debug(f'fail to verify solorun data. {{{self._fg_wl.calc_metric_diff()}}}') + self._cached_fg_num_threads = self._fg_wl.number_of_threads + return True + cur_num_threads = self._fg_wl.number_of_threads - if self._fg_wl.avg_solorun_data is None \ - or cur_num_threads is not 0 and self._cached_fg_num_threads != cur_num_threads: + if cur_num_threads is not 0 and self._cached_fg_num_threads != cur_num_threads: logger.debug(f'number of threads. cached: {self._cached_fg_num_threads}, current : {cur_num_threads}') self._cached_fg_num_threads = cur_num_threads return True - else: - return False + + return False diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 63fb8d5..c4b1cbb 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -173,6 +173,9 @@ def remote_mem_ps(self) -> float: def instruction_ps(self) -> float: return self._instruction_ps + def verify(self) -> bool: + return self._local_mem_ps <= 1 and self._instruction_ps <= 1 + def __repr__(self) -> str: return f'L3 hit ratio diff: {self._l3_hit_ratio:>6.03f}, ' \ f'Local Memory access diff: {self._local_mem_ps:>6.03f}, ' \ From 3a3c3518a402ce8b37a4777dbafaa9b946da7ba0 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sat, 13 Oct 2018 23:55:00 +0900 Subject: [PATCH 64/82] handle workloads that solorun profiling when deleting a group [2/2] --- controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controller.py b/controller.py index 2463c23..ce40529 100755 --- a/controller.py +++ b/controller.py @@ -239,7 +239,7 @@ def _remove_ended_groups(self) -> 
None: group.reset() del self._isolation_groups[group] if group.in_solorun_profiling: - group.stop_solorun_profiling() + group.background_workload.resume() del self._solorun_count[group] def run(self) -> None: From 973bb6649e38e898df44897b866ef102ad95f2cf Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 02:54:20 +0900 Subject: [PATCH 65/82] remove pausing foreground during setting solorun profiling --- isolating_controller/isolation/policies/base.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index 23bb4c0..4e2df39 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -155,7 +155,6 @@ def start_solorun_profiling(self) -> None: self._in_solorun_profile = True # suspend all workloads and their perf agents - self._fg_wl.pause() self._bg_wl.pause() self._fg_wl.metrics.clear() @@ -165,14 +164,10 @@ def start_solorun_profiling(self) -> None: isolator.store_cur_config() isolator.reset() - self._fg_wl.resume() - def stop_solorun_profiling(self) -> None: if not self._in_solorun_profile: raise ValueError('Start solorun profiling first!') - self._fg_wl.pause() - logger = logging.getLogger(__name__) logger.debug(f'number of collected solorun data: {len(self._fg_wl.metrics)}') self._fg_wl.avg_solorun_data = BasicMetric.calc_avg(self._fg_wl.metrics) @@ -186,8 +181,6 @@ def stop_solorun_profiling(self) -> None: self._fg_wl.metrics.clear() - # resume all - self._fg_wl.resume() self._bg_wl.resume() self._in_solorun_profile = False From ab84da82c1c90cee83348db9b1f087abed47de93 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 02:56:34 +0900 Subject: [PATCH 66/82] fixed an error occurred when the `curr` metric was zero when generating `MetricDiff` [2/2] --- .../metric_container/basic_metric.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git 
a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index c4b1cbb..4541b80 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -144,17 +144,13 @@ def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio if curr.local_mem_ps == 0: + self._local_mem_ps = 0 + elif prev.local_mem_ps == 0: # TODO: is it fair? - self._local_mem_ps = 1 + self._local_mem_ps = .99 else: self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 - if curr.remote_mem_ps == 0: - # TODO: is it fair? - self._remote_mem_ps = 1 - else: - self._remote_mem_ps = curr.remote_mem_ps / prev.remote_mem_ps - 1 - self._instruction_ps = curr.instruction_ps / prev.instruction_ps - 1 @property @@ -165,10 +161,6 @@ def l3_hit_ratio(self) -> float: def local_mem_util_ps(self) -> float: return self._local_mem_ps - @property - def remote_mem_ps(self) -> float: - return self._remote_mem_ps - @property def instruction_ps(self) -> float: return self._instruction_ps From f5e9127eb4b56b0ff8239aa4778cd3866096c8c0 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 13:06:55 +0900 Subject: [PATCH 67/82] reduce duplicated codes --- .../isolation/isolators/affinity.py | 49 ++--------------- .../isolation/isolators/base.py | 53 +++++++++++++++++-- .../isolation/isolators/cache.py | 53 ++----------------- .../isolation/isolators/core.py | 6 +-- .../isolation/isolators/memory.py | 52 ++---------------- .../isolation/isolators/schedule.py | 52 ++---------------- 6 files changed, 68 insertions(+), 197 deletions(-) diff --git a/isolating_controller/isolation/isolators/affinity.py b/isolating_controller/isolation/isolators/affinity.py index 3a22c90..9b88cec 100644 --- a/isolating_controller/isolation/isolators/affinity.py +++ b/isolating_controller/isolation/isolators/affinity.py @@ -3,16 +3,12 @@ import 
logging from typing import Optional -from isolating_controller.workload import Workload from .base import Isolator -from .. import NextStep from ...metric_container.basic_metric import MetricDiff +from ...workload import Workload class AffinityIsolator(Isolator): - _DOD_THRESHOLD = 0.005 - _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) @@ -20,6 +16,10 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._stored_config: Optional[int] = None + @classmethod + def _get_metric_type_from(cls, metric_diff: MetricDiff) -> float: + return metric_diff.instruction_ps + def strengthen(self) -> 'AffinityIsolator': self._cur_step += 1 return self @@ -43,45 +43,6 @@ def enforce(self) -> None: self._foreground_wl.bound_cores = range(self._foreground_wl.orig_bound_cores[0], self._cur_step + 1) - def _first_decision(self, metric_diff: MetricDiff) -> NextStep: - curr_diff = metric_diff.instruction_ps - - logger = logging.getLogger(__name__) - logger.debug(f'current diff: {curr_diff:>7.4f}') - - if curr_diff < 0: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN - elif curr_diff <= AffinityIsolator._FORCE_THRESHOLD: - return NextStep.STOP - else: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: - curr_diff = cur_metric_diff.instruction_ps - prev_diff = prev_metric_diff.instruction_ps - diff_of_diff = curr_diff - prev_diff - - logger = logging.getLogger(__name__) - logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') - logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - - if self.is_min_level or self.is_max_level \ - or abs(diff_of_diff) <= AffinityIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= AffinityIsolator._DOD_THRESHOLD: - return NextStep.STOP 
- - elif curr_diff > 0: - return NextStep.WEAKEN - - else: - return NextStep.STRENGTHEN - def reset(self) -> None: if self._foreground_wl.is_running: self._foreground_wl.bound_cores = self._foreground_wl.orig_bound_cores diff --git a/isolating_controller/isolation/isolators/base.py b/isolating_controller/isolation/isolators/base.py index 017aa33..631aa18 100644 --- a/isolating_controller/isolation/isolators/base.py +++ b/isolating_controller/isolation/isolators/base.py @@ -1,7 +1,8 @@ # coding: UTF-8 +import logging from abc import ABCMeta, abstractmethod -from typing import Any, Optional +from typing import Any, ClassVar, Optional from .. import NextStep from ...metric_container.basic_metric import MetricDiff @@ -9,6 +10,9 @@ class Isolator(metaclass=ABCMeta): + _DOD_THRESHOLD: ClassVar[float] = 0.005 + _FORCE_THRESHOLD: ClassVar[float] = 0.1 + def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._prev_metric_diff: MetricDiff = None @@ -69,12 +73,53 @@ def yield_isolation(self) -> None: """ self._is_first_decision = True - @abstractmethod def _first_decision(self, cur_metric_diff: MetricDiff) -> NextStep: - pass + curr_diff = self._get_metric_type_from(cur_metric_diff) + + logger = logging.getLogger(__name__) + logger.debug(f'current diff: {curr_diff:>7.4f}') + + if curr_diff < 0: + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN + elif curr_diff <= self._FORCE_THRESHOLD: + return NextStep.STOP + else: + if self.is_min_level: + return NextStep.STOP + else: + return NextStep.WEAKEN - @abstractmethod def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: + curr_diff = self._get_metric_type_from(cur_metric_diff) + prev_diff = self._get_metric_type_from(prev_metric_diff) + diff_of_diff = curr_diff - prev_diff + + logger = logging.getLogger(__name__) + logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') + logger.debug(f'current diff: {curr_diff:>7.4f}, 
previous diff: {prev_diff:>7.4f}') + + if abs(diff_of_diff) <= self._DOD_THRESHOLD \ + or abs(curr_diff) <= self._DOD_THRESHOLD: + return NextStep.STOP + + elif curr_diff > 0: + if self.is_min_level: + return NextStep.STOP + else: + return NextStep.WEAKEN + + else: + if self.is_max_level: + return NextStep.STOP + else: + return NextStep.STRENGTHEN + + @classmethod + @abstractmethod + def _get_metric_type_from(cls, metric_diff: MetricDiff) -> float: pass def decide_next_step(self) -> NextStep: diff --git a/isolating_controller/isolation/isolators/cache.py b/isolating_controller/isolation/isolators/cache.py index be0b54b..ae72263 100644 --- a/isolating_controller/isolation/isolators/cache.py +++ b/isolating_controller/isolation/isolators/cache.py @@ -4,16 +4,12 @@ from typing import Optional, Tuple from .base import Isolator -from .. import NextStep from ...metric_container.basic_metric import MetricDiff from ...utils import ResCtrl, numa_topology from ...workload import Workload class CacheIsolator(Isolator): - _DOD_THRESHOLD = 0.005 - _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) @@ -22,6 +18,10 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._stored_config: Optional[Tuple[int, int]] = None + @classmethod + def _get_metric_type_from(cls, metric_diff: MetricDiff) -> float: + return metric_diff.l3_hit_ratio + def strengthen(self) -> 'CacheIsolator': self._prev_step = self._cur_step @@ -73,51 +73,6 @@ def enforce(self) -> None: masks[self._background_wl.cur_socket_id()] = ResCtrl.gen_mask(self._cur_step) self._background_wl.resctrl.assign_llc(*masks) - def _first_decision(self, metric_diff: MetricDiff) -> NextStep: - curr_diff = metric_diff.l3_hit_ratio - - logger = logging.getLogger(__name__) - logger.debug(f'current diff: {curr_diff:>7.4f}') - - if curr_diff < 0: - if self.is_max_level: - return NextStep.STOP - else: - return 
NextStep.STRENGTHEN - elif curr_diff <= CacheIsolator._FORCE_THRESHOLD: - return NextStep.STOP - else: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - # TODO: consider turn off cache partitioning - def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: - curr_diff = cur_metric_diff.l3_hit_ratio - prev_diff = prev_metric_diff.l3_hit_ratio - diff_of_diff = curr_diff - prev_diff - - logger = logging.getLogger(__name__) - logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') - logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - - if abs(diff_of_diff) <= CacheIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= CacheIsolator._DOD_THRESHOLD: - return NextStep.STOP - - elif curr_diff > 0: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - else: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN - def reset(self) -> None: masks = [ResCtrl.MIN_MASK] * (max(numa_topology.cur_online_nodes()) + 1) diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index 09acc7f..c0d5cc3 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging -from typing import Optional, Tuple +from typing import Optional, Tuple, ClassVar from .base import Isolator from .. 
import NextStep, ResourceType @@ -10,9 +10,7 @@ class CoreIsolator(Isolator): - _DOD_THRESHOLD = 0.005 - _FORCE_THRESHOLD = 0.1 - _INST_PS_THRESHOLD = -0.5 + _INST_PS_THRESHOLD: ClassVar[float] = -0.5 def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) diff --git a/isolating_controller/isolation/isolators/memory.py b/isolating_controller/isolation/isolators/memory.py index e48e6f6..8dcefe3 100644 --- a/isolating_controller/isolation/isolators/memory.py +++ b/isolating_controller/isolation/isolators/memory.py @@ -4,16 +4,12 @@ from typing import Optional from .base import Isolator -from .. import NextStep from ...metric_container.basic_metric import MetricDiff from ...utils import DVFS from ...workload import Workload class MemoryIsolator(Isolator): - _DOD_THRESHOLD = 0.005 - _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) @@ -21,6 +17,10 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._cur_step: int = DVFS.MAX self._stored_config: Optional[int] = None + @classmethod + def _get_metric_type_from(cls, metric_diff: MetricDiff) -> float: + return metric_diff.local_mem_util_ps + def strengthen(self) -> 'MemoryIsolator': self._cur_step -= DVFS.STEP return self @@ -45,50 +45,6 @@ def enforce(self) -> None: DVFS.set_freq(self._cur_step, self._background_wl.bound_cores) - def _first_decision(self, metric_diff: MetricDiff) -> NextStep: - curr_diff = metric_diff.local_mem_util_ps - - logger = logging.getLogger(__name__) - logger.debug(f'current diff: {curr_diff:>7.4f}') - - if curr_diff < 0: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN - elif curr_diff <= MemoryIsolator._FORCE_THRESHOLD: - return NextStep.STOP - else: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - def _monitoring_result(self, 
prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: - curr_diff = cur_metric_diff.local_mem_util_ps - prev_diff = prev_metric_diff.local_mem_util_ps - diff_of_diff = curr_diff - prev_diff - - logger = logging.getLogger(__name__) - logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') - logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - - if abs(diff_of_diff) <= MemoryIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= MemoryIsolator._DOD_THRESHOLD: - return NextStep.STOP - - elif curr_diff > 0: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - else: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN - def reset(self) -> None: DVFS.set_freq(DVFS.MAX, self._background_wl.orig_bound_cores) diff --git a/isolating_controller/isolation/isolators/schedule.py b/isolating_controller/isolation/isolators/schedule.py index 616b6e6..6db2431 100644 --- a/isolating_controller/isolation/isolators/schedule.py +++ b/isolating_controller/isolation/isolators/schedule.py @@ -4,15 +4,11 @@ from typing import Optional from .base import Isolator -from .. 
import NextStep from ...metric_container.basic_metric import MetricDiff from ...workload import Workload class SchedIsolator(Isolator): - _DOD_THRESHOLD = 0.005 - _FORCE_THRESHOLD = 0.1 - def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: super().__init__(foreground_wl, background_wl) @@ -21,6 +17,10 @@ def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._stored_config: Optional[int] = None + @classmethod + def _get_metric_type_from(cls, metric_diff: MetricDiff) -> float: + return metric_diff.local_mem_util_ps + def strengthen(self) -> 'SchedIsolator': self._cur_step += 1 return self @@ -46,50 +46,6 @@ def enforce(self) -> None: # FIXME: hard coded self._background_wl.bound_cores = range(self._cur_step, self._background_wl.orig_bound_cores[-1] + 1) - def _first_decision(self, metric_diff: MetricDiff) -> NextStep: - curr_diff = metric_diff.local_mem_util_ps - - logger = logging.getLogger(__name__) - logger.debug(f'current diff: {curr_diff:>7.4f}') - - if curr_diff < 0: - if self.is_max_level: - return NextStep.STOP - else: - return NextStep.STRENGTHEN - elif curr_diff <= SchedIsolator._FORCE_THRESHOLD: - return NextStep.STOP - else: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - def _monitoring_result(self, prev_metric_diff: MetricDiff, cur_metric_diff: MetricDiff) -> NextStep: - curr_diff = cur_metric_diff.local_mem_util_ps - prev_diff = prev_metric_diff.local_mem_util_ps - diff_of_diff = curr_diff - prev_diff - - logger = logging.getLogger(__name__) - logger.debug(f'diff of diff is {diff_of_diff:>7.4f}') - logger.debug(f'current diff: {curr_diff:>7.4f}, previous diff: {prev_diff:>7.4f}') - - if abs(diff_of_diff) <= SchedIsolator._DOD_THRESHOLD \ - or abs(curr_diff) <= SchedIsolator._DOD_THRESHOLD: - return NextStep.STOP - - elif curr_diff > 0: - if self.is_min_level: - return NextStep.STOP - else: - return NextStep.WEAKEN - - else: - if self.is_max_level: - return 
NextStep.STOP - else: - return NextStep.STRENGTHEN - def reset(self) -> None: if self._background_wl.is_running: self._background_wl.bound_cores = self._background_wl.orig_bound_cores From f717868c401e4cbb3b0f31588ecb9828e857621b Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 13:07:09 +0900 Subject: [PATCH 68/82] add ClassVar annotation --- isolating_controller/isolation/policies/base.py | 4 ++-- .../policies/defensive_with_violation.py | 3 ++- .../isolation/policies/greedy_with_violation.py | 3 ++- isolating_controller/utils/cgroup/base.py | 6 +++--- isolating_controller/utils/cgroup/cpu.py | 3 ++- isolating_controller/utils/cgroup/cpuset.py | 4 ++-- isolating_controller/utils/dvfs.py | 8 ++++---- isolating_controller/utils/resctrl.py | 16 ++++++++-------- 8 files changed, 25 insertions(+), 22 deletions(-) diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index 4e2df39..68acc8c 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -2,7 +2,7 @@ import logging from abc import ABCMeta, abstractmethod -from typing import Dict, Type +from typing import ClassVar, Dict, Type from .. 
import ResourceType from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator @@ -12,7 +12,7 @@ class IsolationPolicy(metaclass=ABCMeta): - _IDLE_ISOLATOR: IdleIsolator = IdleIsolator() + _IDLE_ISOLATOR: ClassVar[IdleIsolator] = IdleIsolator() def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._fg_wl = fg_wl diff --git a/isolating_controller/isolation/policies/defensive_with_violation.py b/isolating_controller/isolation/policies/defensive_with_violation.py index 6fae5ac..bacea21 100644 --- a/isolating_controller/isolation/policies/defensive_with_violation.py +++ b/isolating_controller/isolation/policies/defensive_with_violation.py @@ -1,6 +1,7 @@ # coding: UTF-8 import logging +from typing import ClassVar from .defensive import DefensivePolicy from .. import ResourceType @@ -9,7 +10,7 @@ class DefensiveWViolationPolicy(DefensivePolicy): - VIOLATION_THRESHOLD = 3 + VIOLATION_THRESHOLD: ClassVar[int] = 3 def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: super().__init__(fg_wl, bg_wl) diff --git a/isolating_controller/isolation/policies/greedy_with_violation.py b/isolating_controller/isolation/policies/greedy_with_violation.py index d8cc79d..25da150 100644 --- a/isolating_controller/isolation/policies/greedy_with_violation.py +++ b/isolating_controller/isolation/policies/greedy_with_violation.py @@ -1,6 +1,7 @@ # coding: UTF-8 import logging +from typing import ClassVar from .greedy import GreedyPolicy from .. 
import ResourceType @@ -9,7 +10,7 @@ class GreedyWViolationPolicy(GreedyPolicy): - VIOLATION_THRESHOLD = 3 + VIOLATION_THRESHOLD: ClassVar[int] = 3 def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: super().__init__(fg_wl, bg_wl) diff --git a/isolating_controller/utils/cgroup/base.py b/isolating_controller/utils/cgroup/base.py index 9c27abf..649c47f 100644 --- a/isolating_controller/utils/cgroup/base.py +++ b/isolating_controller/utils/cgroup/base.py @@ -5,12 +5,12 @@ import os import subprocess from abc import ABCMeta -from typing import Iterable +from typing import ClassVar, Iterable class BaseCgroup(metaclass=ABCMeta): - MOUNT_POINT = '/sys/fs/cgroup' - CONTROLLER = str() + MOUNT_POINT: ClassVar[str] = '/sys/fs/cgroup' + CONTROLLER: ClassVar[str] = str() def __init__(self, group_name: str) -> None: self._group_name: str = group_name diff --git a/isolating_controller/utils/cgroup/cpu.py b/isolating_controller/utils/cgroup/cpu.py index 889cdff..481f38c 100644 --- a/isolating_controller/utils/cgroup/cpu.py +++ b/isolating_controller/utils/cgroup/cpu.py @@ -2,12 +2,13 @@ import subprocess +from typing import ClassVar from .base import BaseCgroup class Cpu(BaseCgroup): - CONTROLLER = 'cpu' + CONTROLLER: ClassVar[str] = 'cpu' def limit_cpu_quota(self, quota: int, period: int) -> None: subprocess.check_call(args=('cgset', '-r', f'cpu.cfs_quota_us={quota}', self._group_name)) diff --git a/isolating_controller/utils/cgroup/cpuset.py b/isolating_controller/utils/cgroup/cpuset.py index 086e2c0..17515d4 100644 --- a/isolating_controller/utils/cgroup/cpuset.py +++ b/isolating_controller/utils/cgroup/cpuset.py @@ -2,14 +2,14 @@ import subprocess -from typing import Iterable, Set +from typing import ClassVar, Iterable, Set from .base import BaseCgroup from ..hyphen import convert_to_set class CpuSet(BaseCgroup): - CONTROLLER = 'cpuset' + CONTROLLER: ClassVar[str] = 'cpuset' def assign_cpus(self, core_set: Iterable[int]) -> None: core_ids = ','.join(map(str, 
core_set)) diff --git a/isolating_controller/utils/dvfs.py b/isolating_controller/utils/dvfs.py index 6ee1f88..221e2c9 100644 --- a/isolating_controller/utils/dvfs.py +++ b/isolating_controller/utils/dvfs.py @@ -2,15 +2,15 @@ import subprocess from pathlib import Path -from typing import Iterable +from typing import ClassVar, Iterable from isolating_controller.utils.cgroup import CpuSet class DVFS: - MIN = int(Path('/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq').read_text()) - STEP = 100000 - MAX = int(Path('/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq').read_text()) + MIN: ClassVar[int] = int(Path('/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq').read_text()) + STEP: ClassVar[int] = 100000 + MAX: ClassVar[int] = int(Path('/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq').read_text()) def __init__(self, group_name): self._group_name: str = group_name diff --git a/isolating_controller/utils/resctrl.py b/isolating_controller/utils/resctrl.py index 1ae14be..6c15a0f 100644 --- a/isolating_controller/utils/resctrl.py +++ b/isolating_controller/utils/resctrl.py @@ -3,7 +3,7 @@ import re import subprocess from pathlib import Path -from typing import List, Pattern, Tuple +from typing import ClassVar, List, Pattern, Tuple def len_of_mask(mask: str) -> int: @@ -20,13 +20,13 @@ def bits_to_mask(bits: int) -> str: class ResCtrl: - MOUNT_POINT: Path = Path('/sys/fs/resctrl') - MAX_MASK: str = Path('/sys/fs/resctrl/info/L3/cbm_mask').read_text(encoding='ASCII').strip() - MAX_BITS: int = len_of_mask((MOUNT_POINT / 'info' / 'L3' / 'cbm_mask').read_text()) - MIN_BITS: int = int((MOUNT_POINT / 'info' / 'L3' / 'min_cbm_bits').read_text()) - MIN_MASK: str = bits_to_mask(MIN_BITS) - STEP = 1 - _read_regex: Pattern = re.compile(r'L3:((\d+=[0-9a-fA-F]+;?)*)', re.MULTILINE) + MOUNT_POINT: ClassVar[Path] = Path('/sys/fs/resctrl') + MAX_MASK: ClassVar[str] = Path('/sys/fs/resctrl/info/L3/cbm_mask').read_text(encoding='ASCII').strip() + MAX_BITS: ClassVar[int] 
= len_of_mask((MOUNT_POINT / 'info' / 'L3' / 'cbm_mask').read_text()) + MIN_BITS: ClassVar[int] = int((MOUNT_POINT / 'info' / 'L3' / 'min_cbm_bits').read_text()) + MIN_MASK: ClassVar[str] = bits_to_mask(MIN_BITS) + STEP: ClassVar[int] = 1 + _read_regex: ClassVar[Pattern] = re.compile(r'L3:((\d+=[0-9a-fA-F]+;?)*)', re.MULTILINE) def __init__(self, group_name: str) -> None: self._group_name: str = group_name From ff03519a6e01a5dfce10d71a054cd0e914b44654 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 13:20:47 +0900 Subject: [PATCH 69/82] subdivide the criteria of solorun profiling [2/2] --- isolating_controller/isolation/policies/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index 68acc8c..f2d4938 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -168,6 +168,8 @@ def stop_solorun_profiling(self) -> None: if not self._in_solorun_profile: raise ValueError('Start solorun profiling first!') + self._cached_fg_num_threads = self._fg_wl.number_of_threads + logger = logging.getLogger(__name__) logger.debug(f'number of collected solorun data: {len(self._fg_wl.metrics)}') self._fg_wl.avg_solorun_data = BasicMetric.calc_avg(self._fg_wl.metrics) @@ -194,18 +196,15 @@ def profile_needed(self) -> bool: if self._fg_wl.avg_solorun_data is None: logger.debug('initialize solorun data') - self._cached_fg_num_threads = self._fg_wl.number_of_threads return True if not self._fg_wl.calc_metric_diff().verify(): logger.debug(f'fail to verify solorun data. {{{self._fg_wl.calc_metric_diff()}}}') - self._cached_fg_num_threads = self._fg_wl.number_of_threads return True cur_num_threads = self._fg_wl.number_of_threads if cur_num_threads is not 0 and self._cached_fg_num_threads != cur_num_threads: logger.debug(f'number of threads. 
cached: {self._cached_fg_num_threads}, current : {cur_num_threads}') - self._cached_fg_num_threads = cur_num_threads return True return False From 4deb7507bdcbc50cee9ad9ee591d144aaf820432 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 13:36:27 +0900 Subject: [PATCH 70/82] reduce duplicated codes [2/2] --- isolating_controller/isolation/isolators/idle.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/isolating_controller/isolation/isolators/idle.py b/isolating_controller/isolation/isolators/idle.py index 462657d..aa4b8ea 100644 --- a/isolating_controller/isolation/isolators/idle.py +++ b/isolating_controller/isolation/isolators/idle.py @@ -2,12 +2,17 @@ from .base import Isolator from .. import NextStep +from ...metric_container.basic_metric import MetricDiff class IdleIsolator(Isolator): def __init__(self) -> None: pass + @classmethod + def _get_metric_type_from(cls, metric_diff: MetricDiff) -> float: + pass + def strengthen(self) -> 'Isolator': pass From 4c2d4ae43860baf39991fbf29d4957113dfa52bb Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Sun, 14 Oct 2018 13:38:38 +0900 Subject: [PATCH 71/82] split controller.py into polling_thread.py --- controller.py | 122 ++----------------------------- isolating_controller/workload.py | 2 +- polling_thread.py | 113 ++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 115 deletions(-) create mode 100644 polling_thread.py diff --git a/controller.py b/controller.py index ce40529..662c5f9 100755 --- a/controller.py +++ b/controller.py @@ -3,139 +3,29 @@ import argparse import datetime -import functools -import json import logging import os import subprocess import sys import time -from threading import Thread from typing import Dict, Optional -import pika import psutil -from pika import BasicProperties -from pika.adapters.blocking_connection import BlockingChannel -from pika.spec import Basic import isolating_controller from isolating_controller.isolation import NextStep from 
isolating_controller.isolation.isolators import Isolator from isolating_controller.isolation.policies import GreedyWViolationPolicy, IsolationPolicy -from isolating_controller.metric_container.basic_metric import BasicMetric -from isolating_controller.workload import Workload from pending_queue import PendingQueue +from polling_thread import PollingThread from swap_iso import SwapIsolator MIN_PYTHON = (3, 6) -class Singleton(type): - _instances = {} - - def __call__(cls, *args, **kwargs): - if cls not in cls._instances: - cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) - return cls._instances[cls] - - -class MainController(metaclass=Singleton): +class Controller: def __init__(self, metric_buf_size: int) -> None: - self._metric_buf_size = metric_buf_size - - self._rmq_host = 'localhost' - self._rmq_creation_queue = 'workload_creation' - - self._pending_wl = PendingQueue(GreedyWViolationPolicy) - self._control_thread = ControlThread(self._pending_wl) - - def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: - ch.basic_ack(method.delivery_tag) - - arr = body.decode().strip().split(',') - - logger = logging.getLogger('monitoring.workload_creation') - logger.debug(f'{arr} is received from workload_creation queue') - - if len(arr) != 5: - return - - wl_identifier, wl_type, pid, perf_pid, perf_interval = arr - pid = int(pid) - perf_pid = int(perf_pid) - perf_interval = int(perf_interval) - item = wl_identifier.split('_') - wl_name = item[0] - - if not psutil.pid_exists(pid): - return - - workload = Workload(wl_name, wl_type, pid, perf_pid, perf_interval) - if wl_type == 'bg': - logger.info(f'{workload} is background process') - else: - logger.info(f'{workload} is foreground process') - - self._pending_wl.add(workload) - - wl_queue_name = '{}({})'.format(wl_name, pid) - ch.queue_declare(wl_queue_name) - ch.basic_consume(functools.partial(self._cbk_wl_monitor, workload), wl_queue_name) - - def 
_cbk_wl_monitor(self, workload: Workload, - ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: - metric = json.loads(body.decode()) - ch.basic_ack(method.delivery_tag) - - item = BasicMetric(metric['l2miss'], - metric['l3miss'], - metric['instructions'], - metric['cycles'], - metric['stall_cycles'], - metric['wall_cycles'], - metric['intra_coh'], - metric['inter_coh'], - metric['llc_size'], - metric['local_mem'], - metric['remote_mem'], - workload.perf_interval) - - logger = logging.getLogger(f'monitoring.metric.{workload}') - logger.debug(f'{metric} is given from ') - - metric_que = workload.metrics - - if len(metric_que) == self._metric_buf_size: - metric_que.pop() - - metric_que.appendleft(item) - - def run(self) -> None: - logger = logging.getLogger('monitoring') - - self._control_thread.start() - - connection = pika.BlockingConnection(pika.ConnectionParameters(host=self._rmq_host)) - channel = connection.channel() - - channel.queue_declare(self._rmq_creation_queue) - channel.basic_consume(self._cbk_wl_creation, self._rmq_creation_queue) - - try: - logger.debug('starting consuming thread') - channel.start_consuming() - - except KeyboardInterrupt: - channel.close() - connection.close() - - -class ControlThread(Thread): - def __init__(self, pending_queue: PendingQueue) -> None: - super().__init__(daemon=True) - - self._pending_queue: PendingQueue = pending_queue + self._pending_queue: PendingQueue = PendingQueue(GreedyWViolationPolicy) self._interval: float = 0.2 # scheduling interval (sec) self._profile_interval: float = 1.0 # check interval for phase change (sec) @@ -144,6 +34,8 @@ def __init__(self, pending_queue: PendingQueue) -> None: self._isolation_groups: Dict[IsolationPolicy, int] = dict() + self._polling_thread = PollingThread(metric_buf_size, self._pending_queue) + # Swapper init self._swapper: SwapIsolator = SwapIsolator(self._isolation_groups) @@ -243,6 +135,8 @@ def _remove_ended_groups(self) -> None: del 
self._solorun_count[group] def run(self) -> None: + self._polling_thread.start() + logger = logging.getLogger(__name__) logger.info('starting isolation loop') @@ -284,7 +178,7 @@ def main() -> None: monitoring_logger.addHandler(stream_handler) monitoring_logger.addHandler(file_handler) - controller = MainController(args.buf_size) + controller = Controller(args.buf_size) controller.run() diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index 514b9da..b540e6e 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -15,7 +15,7 @@ class Workload: """ This class abstracts the process and contains the related metrics to represent its characteristics - ControlThread schedules the groups of `Workload' instances to enforce their scheduling decisions + Controller schedules the groups of `Workload' instances to enforce their scheduling decisions """ def __init__(self, name: str, wl_type: str, pid: int, perf_pid: int, perf_interval: int) -> None: diff --git a/polling_thread.py b/polling_thread.py new file mode 100644 index 0000000..aab6e5f --- /dev/null +++ b/polling_thread.py @@ -0,0 +1,113 @@ +# coding: UTF-8 + +import functools +import json +import logging +from threading import Thread + +import pika +import psutil +from pika import BasicProperties +from pika.adapters.blocking_connection import BlockingChannel +from pika.spec import Basic + +from isolating_controller.metric_container.basic_metric import BasicMetric +from isolating_controller.workload import Workload +from pending_queue import PendingQueue + + +class Singleton(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class PollingThread(Thread, metaclass=Singleton): + def __init__(self, metric_buf_size: int, pending_queue: PendingQueue) -> None: + super().__init__(daemon=True) + self._metric_buf_size = 
metric_buf_size + + self._rmq_host = 'localhost' + self._rmq_creation_queue = 'workload_creation' + + self._pending_wl = pending_queue + + def _cbk_wl_creation(self, ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: + ch.basic_ack(method.delivery_tag) + + arr = body.decode().strip().split(',') + + logger = logging.getLogger('monitoring.workload_creation') + logger.debug(f'{arr} is received from workload_creation queue') + + if len(arr) != 5: + return + + wl_identifier, wl_type, pid, perf_pid, perf_interval = arr + pid = int(pid) + perf_pid = int(perf_pid) + perf_interval = int(perf_interval) + item = wl_identifier.split('_') + wl_name = item[0] + + if not psutil.pid_exists(pid): + return + + workload = Workload(wl_name, wl_type, pid, perf_pid, perf_interval) + if wl_type == 'bg': + logger.info(f'{workload} is background process') + else: + logger.info(f'{workload} is foreground process') + + self._pending_wl.add(workload) + + wl_queue_name = '{}({})'.format(wl_name, pid) + ch.queue_declare(wl_queue_name) + ch.basic_consume(functools.partial(self._cbk_wl_monitor, workload), wl_queue_name) + + def _cbk_wl_monitor(self, workload: Workload, + ch: BlockingChannel, method: Basic.Deliver, _: BasicProperties, body: bytes) -> None: + metric = json.loads(body.decode()) + ch.basic_ack(method.delivery_tag) + + item = BasicMetric(metric['l2miss'], + metric['l3miss'], + metric['instructions'], + metric['cycles'], + metric['stall_cycles'], + metric['wall_cycles'], + metric['intra_coh'], + metric['inter_coh'], + metric['llc_size'], + metric['local_mem'], + metric['remote_mem'], + workload.perf_interval) + + logger = logging.getLogger(f'monitoring.metric.{workload}') + logger.debug(f'{metric} is given from ') + + metric_que = workload.metrics + + if len(metric_que) == self._metric_buf_size: + metric_que.pop() + + metric_que.appendleft(item) + + def run(self) -> None: + connection = 
pika.BlockingConnection(pika.ConnectionParameters(host=self._rmq_host)) + channel = connection.channel() + + channel.queue_declare(self._rmq_creation_queue) + channel.basic_consume(self._cbk_wl_creation, self._rmq_creation_queue) + + try: + logger = logging.getLogger('monitoring') + logger.debug('starting consuming thread') + channel.start_consuming() + + except KeyboardInterrupt: + channel.close() + connection.close() From b2b06ffa60cc129e5cb8e6ceb27432ca808e28c1 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Mon, 15 Oct 2018 21:02:43 +0900 Subject: [PATCH 72/82] add aggressive policy --- controller.py | 4 +- .../isolation/policies/__init__.py | 2 + .../isolation/policies/aggressive.py | 52 ++++++++++++++++++ .../policies/aggressive_with_violation.py | 55 +++++++++++++++++++ .../isolation/policies/base.py | 27 ++++----- 5 files changed, 121 insertions(+), 19 deletions(-) create mode 100644 isolating_controller/isolation/policies/aggressive.py create mode 100644 isolating_controller/isolation/policies/aggressive_with_violation.py diff --git a/controller.py b/controller.py index 662c5f9..b374fbc 100755 --- a/controller.py +++ b/controller.py @@ -15,7 +15,7 @@ import isolating_controller from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator -from isolating_controller.isolation.policies import GreedyWViolationPolicy, IsolationPolicy +from isolating_controller.isolation.policies import AggressiveWViolationPolicy, IsolationPolicy from pending_queue import PendingQueue from polling_thread import PollingThread from swap_iso import SwapIsolator @@ -25,7 +25,7 @@ class Controller: def __init__(self, metric_buf_size: int) -> None: - self._pending_queue: PendingQueue = PendingQueue(GreedyWViolationPolicy) + self._pending_queue: PendingQueue = PendingQueue(AggressiveWViolationPolicy) self._interval: float = 0.2 # scheduling interval (sec) self._profile_interval: float = 1.0 # check interval for phase change (sec) 
diff --git a/isolating_controller/isolation/policies/__init__.py b/isolating_controller/isolation/policies/__init__.py index 6861774..bcd7ef3 100644 --- a/isolating_controller/isolation/policies/__init__.py +++ b/isolating_controller/isolation/policies/__init__.py @@ -1,5 +1,7 @@ # coding: UTF-8 +from .aggressive import AggressivePolicy +from .aggressive_with_violation import AggressiveWViolationPolicy from .base import IsolationPolicy from .defensive import DefensivePolicy from .defensive_cpu import DefensiveCPUPolicy diff --git a/isolating_controller/isolation/policies/aggressive.py b/isolating_controller/isolation/policies/aggressive.py new file mode 100644 index 0000000..7b653f8 --- /dev/null +++ b/isolating_controller/isolation/policies/aggressive.py @@ -0,0 +1,52 @@ +# coding: UTF-8 + +import logging + +from .base import IsolationPolicy +from .. import ResourceType +from ..isolators import AffinityIsolator, CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ...workload import Workload + + +class AggressivePolicy(IsolationPolicy): + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) + + self._is_mem_isolated = False + + @property + def new_isolator_needed(self) -> bool: + return isinstance(self._cur_isolator, IdleIsolator) + + def choose_next_isolator(self) -> bool: + logger = logging.getLogger(__name__) + logger.debug('looking for new isolation...') + + # if foreground is web server (CPU critical) + if len(self._fg_wl.bound_cores) * 2 < self._fg_wl.number_of_threads: + if AffinityIsolator in self._isolator_map and not self._isolator_map[AffinityIsolator].is_max_level: + self._cur_isolator = self._isolator_map[AffinityIsolator] + logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') + return True + + for resource, diff_value in self.contentious_resources(): + if resource is ResourceType.CACHE: + isolator = self._isolator_map[CacheIsolator] + elif resource is ResourceType.MEMORY: + if 
self._is_mem_isolated: + isolator = self._isolator_map[SchedIsolator] + self._is_mem_isolated = False + else: + isolator = self._isolator_map[MemoryIsolator] + self._is_mem_isolated = True + else: + raise NotImplementedError(f'Unknown ResourceType: {resource}') + + if diff_value < 0 and not isolator.is_max_level or \ + diff_value > 0 and not isolator.is_min_level: + self._cur_isolator = isolator + logger.info(f'Starting {self._cur_isolator.__class__.__name__}...') + return True + + logger.debug('A new Isolator has not been selected') + return False diff --git a/isolating_controller/isolation/policies/aggressive_with_violation.py b/isolating_controller/isolation/policies/aggressive_with_violation.py new file mode 100644 index 0000000..3682122 --- /dev/null +++ b/isolating_controller/isolation/policies/aggressive_with_violation.py @@ -0,0 +1,55 @@ +# coding: UTF-8 + +import logging +from typing import ClassVar + +from .aggressive import AggressivePolicy +from .. import ResourceType +from ..isolators import AffinityIsolator, CacheIsolator, IdleIsolator, MemoryIsolator, SchedIsolator +from ...workload import Workload + + +class AggressiveWViolationPolicy(AggressivePolicy): + VIOLATION_THRESHOLD: ClassVar[int] = 3 + + def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: + super().__init__(fg_wl, bg_wl) + + self._violation_count: int = 0 + + def _check_violation(self) -> bool: + if isinstance(self._cur_isolator, AffinityIsolator): + return False + + resource: ResourceType = self.contentious_resource() + + return \ + resource is ResourceType.CACHE and not isinstance(self._cur_isolator, CacheIsolator) \ + or resource is ResourceType.MEMORY and not (isinstance(self._cur_isolator, MemoryIsolator) + or isinstance(self._cur_isolator, SchedIsolator)) + + @property + def new_isolator_needed(self) -> bool: + if isinstance(self._cur_isolator, IdleIsolator): + return True + + if self._check_violation(): + logger = logging.getLogger(__name__) + logger.info(f'violation is 
occurred. current isolator type : {self._cur_isolator.__class__.__name__}') + + self._violation_count += 1 + + if self._violation_count >= AggressiveWViolationPolicy.VIOLATION_THRESHOLD: + logger.info('new isolator is required due to violation') + self.set_idle_isolator() + self._violation_count = 0 + return True + + return False + + def choose_next_isolator(self) -> bool: + if super().choose_next_isolator(): + self._violation_count = 0 + return True + + return False diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index f2d4938..7d3db3d 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -2,7 +2,7 @@ import logging from abc import ABCMeta, abstractmethod -from typing import ClassVar, Dict, Type +from typing import ClassVar, Dict, Tuple, Type from .. import ResourceType from ..isolators import CacheIsolator, IdleIsolator, Isolator, MemoryIsolator, SchedIsolator @@ -52,29 +52,23 @@ def choose_next_isolator(self) -> bool: pass def contentious_resource(self) -> ResourceType: + return self.contentious_resources()[0][0] + + def contentious_resources(self) -> Tuple[Tuple[ResourceType, float], ...]: metric_diff: MetricDiff = self._fg_wl.calc_metric_diff() logger = logging.getLogger(__name__) logger.info(f'foreground : {metric_diff}') logger.info(f'background : {self._bg_wl.calc_metric_diff()}') - if metric_diff.local_mem_util_ps > 0 and metric_diff.l3_hit_ratio > 0: - if metric_diff.l3_hit_ratio > metric_diff.local_mem_util_ps: - return ResourceType.CACHE - else: - return ResourceType.MEMORY - - elif metric_diff.local_mem_util_ps < 0 < metric_diff.l3_hit_ratio: - return ResourceType.MEMORY + resources = ((ResourceType.CACHE, metric_diff.l3_hit_ratio), + (ResourceType.MEMORY, metric_diff.local_mem_util_ps)) - elif metric_diff.l3_hit_ratio < 0 < metric_diff.local_mem_util_ps: - return ResourceType.CACHE + if all(v > 0 for m, v in resources): + 
return tuple(sorted(resources, key=lambda x: x[1], reverse=True)) else: - if metric_diff.l3_hit_ratio > metric_diff.local_mem_util_ps: - return ResourceType.MEMORY - else: - return ResourceType.CACHE + return tuple(sorted(resources, key=lambda x: x[1])) @property def foreground_workload(self) -> Workload: @@ -153,6 +147,7 @@ def start_solorun_profiling(self) -> None: raise ValueError('Stop the ongoing solorun profiling first!') self._in_solorun_profile = True + self._cached_fg_num_threads = self._fg_wl.number_of_threads # suspend all workloads and their perf agents self._bg_wl.pause() @@ -168,8 +163,6 @@ def stop_solorun_profiling(self) -> None: if not self._in_solorun_profile: raise ValueError('Start solorun profiling first!') - self._cached_fg_num_threads = self._fg_wl.number_of_threads - logger = logging.getLogger(__name__) logger.debug(f'number of collected solorun data: {len(self._fg_wl.metrics)}') self._fg_wl.avg_solorun_data = BasicMetric.calc_avg(self._fg_wl.metrics) From ead5784b9bc194d54218ce700aa94bbe4d38783b Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 16 Oct 2018 14:03:36 +0900 Subject: [PATCH 73/82] added violation of verifying solorun data --- isolating_controller/isolation/policies/base.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index 7d3db3d..17a3454 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -13,6 +13,7 @@ class IsolationPolicy(metaclass=ABCMeta): _IDLE_ISOLATOR: ClassVar[IdleIsolator] = IdleIsolator() + _VERIFY_THRESHOLD: ClassVar[int] = 3 def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._fg_wl = fg_wl @@ -30,6 +31,7 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: self._in_solorun_profile: bool = False self._cached_fg_num_threads: int = fg_wl.number_of_threads + 
self._solorun_verify_violation_count: int = 0 def __hash__(self) -> int: return id(self) @@ -148,6 +150,7 @@ def start_solorun_profiling(self) -> None: self._in_solorun_profile = True self._cached_fg_num_threads = self._fg_wl.number_of_threads + self._solorun_verify_violation_count = 0 # suspend all workloads and their perf agents self._bg_wl.pause() @@ -192,8 +195,11 @@ def profile_needed(self) -> bool: return True if not self._fg_wl.calc_metric_diff().verify(): - logger.debug(f'fail to verify solorun data. {{{self._fg_wl.calc_metric_diff()}}}') - return True + self._solorun_verify_violation_count += 1 + + if self._solorun_verify_violation_count == self._VERIFY_THRESHOLD: + logger.debug(f'fail to verify solorun data. {{{self._fg_wl.calc_metric_diff()}}}') + return True cur_num_threads = self._fg_wl.number_of_threads if cur_num_threads is not 0 and self._cached_fg_num_threads != cur_num_threads: From bc7c5a19052cc719cbbe0acca90b1ccc6b822fdd Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 16 Oct 2018 14:26:33 +0900 Subject: [PATCH 74/82] fixes constructor of MetricDiff --- isolating_controller/metric_container/basic_metric.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index 4541b80..aa6b44f 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -140,14 +140,20 @@ def __repr__(self) -> str: class MetricDiff: + # FIXME: hard coded + _MAX_MEM_BANDWIDTH_PS = 68 * 1024 * 1024 * 1024 + def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio if curr.local_mem_ps == 0: - self._local_mem_ps = 0 + if prev.local_mem_ps == 0: + self._local_mem_ps = 0 + else: + self._local_mem_ps = -prev.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS elif prev.local_mem_ps == 0: # TODO: is it fair? 
- self._local_mem_ps = .99 + self._local_mem_ps = curr.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS else: self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 From ac6c7ed6f7746bdd19b2758742a4272aa428d227 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 16 Oct 2018 17:56:59 +0900 Subject: [PATCH 75/82] add new offline data --- isolating_controller/solorun_data/bt.json | 1 + isolating_controller/solorun_data/ua.json | 1 + solorun_data/8core/bt.json | 15 +++++++++++++++ solorun_data/8core/ua.json | 15 +++++++++++++++ 4 files changed, 32 insertions(+) create mode 120000 isolating_controller/solorun_data/bt.json create mode 120000 isolating_controller/solorun_data/ua.json create mode 100644 solorun_data/8core/bt.json create mode 100644 solorun_data/8core/ua.json diff --git a/isolating_controller/solorun_data/bt.json b/isolating_controller/solorun_data/bt.json new file mode 120000 index 0000000..4707e3c --- /dev/null +++ b/isolating_controller/solorun_data/bt.json @@ -0,0 +1 @@ +../../solorun_data/8core/bt.json \ No newline at end of file diff --git a/isolating_controller/solorun_data/ua.json b/isolating_controller/solorun_data/ua.json new file mode 120000 index 0000000..10506e3 --- /dev/null +++ b/isolating_controller/solorun_data/ua.json @@ -0,0 +1 @@ +../../solorun_data/8core/ua.json \ No newline at end of file diff --git a/solorun_data/8core/bt.json b/solorun_data/8core/bt.json new file mode 100644 index 0000000..b84f233 --- /dev/null +++ b/solorun_data/8core/bt.json @@ -0,0 +1,15 @@ +{ + "name": "BT", + "runtime": 180.0266306400299, + "l2miss": 209131122.35161108, + "l3miss": 112225709.15771621, + "instructions": 37314385197.06614, + "cycles": 16792362436.009045, + "stall_cycles": 3760861210.672696, + "wall_cycles": 2114139368.920294, + "intra_coh": 21035.449406444317, + "inter_coh": 15877.778405879028, + "llc_size": 41439544.54618474, + "local_mem": 8457671237.173544, + "remote_mem": 69865673.44262296 +} \ No newline at end of file diff --git 
a/solorun_data/8core/ua.json b/solorun_data/8core/ua.json new file mode 100644 index 0000000..b3d611b --- /dev/null +++ b/solorun_data/8core/ua.json @@ -0,0 +1,15 @@ +{ + "name": "UA", + "runtime": 191.63576126098633, + "l2miss": 234905209.9230565, + "l3miss": 161282014.08331123, + "instructions": 20563319157.1186, + "cycles": 15614710154.592731, + "stall_cycles": 5650998669.477315, + "wall_cycles": 2113380014.3857787, + "intra_coh": 354513.00079596706, + "inter_coh": 14184.600689838153, + "llc_size": 41439544.54618474, + "local_mem": 15061261994.75723, + "remote_mem": 95241943.85778722 +} \ No newline at end of file From b20cd8e224855e71298ae13136ecbcfa2e67ac28 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 16 Oct 2018 17:57:13 +0900 Subject: [PATCH 76/82] cleanup policies/base.py --- .../isolation/policies/base.py | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index f35ed84..26c11f6 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -195,34 +195,6 @@ def safe_to_swap(self) -> bool: def aggr_inst(self) -> float: return self._aggr_inst_diff - @property - def most_cont_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_inst_diff = fg_wl.inst_diff - bg_inst_diff = bg_wl.inst_diff - - # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_inst_diff < bg_inst_diff: - return fg_wl - else: - return bg_wl - - @property - def least_cont_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_ipc_diff = fg_wl.inst_diff - bg_ipc_diff = bg_wl.inst_diff - - # FIXME: Below condition is likely to fail due to too little differences between fg and bg - if fg_ipc_diff > bg_ipc_diff: - return fg_wl - else: - return bg_wl - @property def 
least_mem_bw_workload(self) -> Workload: fg_wl = self.foreground_workload From 25cc166d917fc11e615a2c68102170aeee10abbf Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 16 Oct 2018 17:58:59 +0900 Subject: [PATCH 77/82] move swap_iso.py and rename it --- controller.py | 2 +- swap_iso.py => isolating_controller/isolation/swapper.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename swap_iso.py => isolating_controller/isolation/swapper.py (97%) diff --git a/controller.py b/controller.py index 6089726..24dbd05 100755 --- a/controller.py +++ b/controller.py @@ -16,9 +16,9 @@ from isolating_controller.isolation import NextStep from isolating_controller.isolation.isolators import Isolator from isolating_controller.isolation.policies import AggressiveWViolationPolicy, IsolationPolicy +from isolating_controller.isolation.swapper import SwapIsolator from pending_queue import PendingQueue from polling_thread import PollingThread -from swap_iso import SwapIsolator MIN_PYTHON = (3, 6) diff --git a/swap_iso.py b/isolating_controller/isolation/swapper.py similarity index 97% rename from swap_iso.py rename to isolating_controller/isolation/swapper.py index 57a12d7..5ddf348 100644 --- a/swap_iso.py +++ b/isolating_controller/isolation/swapper.py @@ -7,8 +7,8 @@ import psutil -from isolating_controller.isolation.policies.base import IsolationPolicy -from isolating_controller.workload import Workload +from .policies.base import IsolationPolicy +from ..workload import Workload class SwapNextStep(IntEnum): From a974af698497c4e7616eb8bf1329fa3449165e07 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 17 Oct 2018 16:49:08 +0900 Subject: [PATCH 78/82] rewrite swapper.py --- controller.py | 2 +- .../isolation/isolators/core.py | 2 +- .../isolation/policies/base.py | 29 +--- isolating_controller/isolation/swapper.py | 125 ++++++++---------- 4 files changed, 55 insertions(+), 103 deletions(-) diff --git a/controller.py b/controller.py index 24dbd05..f63b014 
100755 --- a/controller.py +++ b/controller.py @@ -97,7 +97,7 @@ def _isolate_workloads(self) -> None: finally: self._isolation_groups[group] += 1 - if len(tuple(filter(lambda x: x.safe_to_swap, self._isolation_groups.keys()))) < 2: + if len(tuple(g for g in self._isolation_groups if g.safe_to_swap)) >= 2: if self._swapper.swap_is_needed(): self._swapper.do_swap() diff --git a/isolating_controller/isolation/isolators/core.py b/isolating_controller/isolation/isolators/core.py index c0d5cc3..104ef10 100644 --- a/isolating_controller/isolation/isolators/core.py +++ b/isolating_controller/isolation/isolators/core.py @@ -1,7 +1,7 @@ # coding: UTF-8 import logging -from typing import Optional, Tuple, ClassVar +from typing import ClassVar, Optional, Tuple from .base import Isolator from .. import NextStep, ResourceType diff --git a/isolating_controller/isolation/policies/base.py b/isolating_controller/isolation/policies/base.py index 26c11f6..cd74176 100644 --- a/isolating_controller/isolation/policies/base.py +++ b/isolating_controller/isolation/policies/base.py @@ -27,8 +27,6 @@ def __init__(self, fg_wl: Workload, bg_wl: Workload) -> None: )) self._cur_isolator: Isolator = IsolationPolicy._IDLE_ISOLATOR - self._aggr_inst_diff: float = None - self._in_solorun_profile: bool = False self._cached_fg_num_threads: int = fg_wl.number_of_threads self._solorun_verify_violation_count: int = 0 @@ -189,29 +187,4 @@ def profile_needed(self) -> bool: @property def safe_to_swap(self) -> bool: - return not self._in_solorun_profile and len(self._fg_wl.metrics) > 0 - - @property - def aggr_inst(self) -> float: - return self._aggr_inst_diff - - @property - def least_mem_bw_workload(self) -> Workload: - fg_wl = self.foreground_workload - bg_wl = self.background_workload - - fg_mem_bw = fg_wl.metrics[0].local_mem_ps - bg_mem_bw = bg_wl.metrics[0].local_mem_ps - - if fg_mem_bw > bg_mem_bw: - return bg_wl - else: - return fg_wl - - # FIXME: replace to property - def update_aggr_instr(self) -> 
None: - fg_diff = self._fg_wl.calc_metric_diff() - bg_diff = self._bg_wl.calc_metric_diff() - self._fg_wl._ipc_diff = fg_diff.instruction_ps - self._bg_wl._ipc_diff = bg_diff.instruction_ps - self._aggr_inst_diff = fg_diff.instruction_ps + bg_diff.instruction_ps + return not self._in_solorun_profile and len(self._fg_wl.metrics) > 0 and self._fg_wl.calc_metric_diff().verify() diff --git a/isolating_controller/isolation/swapper.py b/isolating_controller/isolation/swapper.py index 5ddf348..0edba8d 100644 --- a/isolating_controller/isolation/swapper.py +++ b/isolating_controller/isolation/swapper.py @@ -2,24 +2,17 @@ import logging import subprocess -from enum import IntEnum -from typing import Dict, Optional, Set +from typing import Dict, Optional, Set, Tuple import psutil from .policies.base import IsolationPolicy -from ..workload import Workload - - -class SwapNextStep(IntEnum): - OUT = 0 - IN = 1 class SwapIsolator: # FIXME: This threshold needs tests (How small diff is right for swapping workloads?) # "-0.5" means the IPCs of workloads in a group drop 50% compared to solo-run - _IPC_DIFF_THRESHOLD = -0.5 + _INST_DIFF_THRESHOLD = -1 _VIOLATION_THRESHOLD = 5 def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: @@ -27,19 +20,11 @@ def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: :param isolation_groups: Dict. 
Key is the index of group and Value is the group itself """ self._all_groups: Dict[IsolationPolicy, int] = isolation_groups - self._swap_candidates: Dict[SwapNextStep, Workload] = dict() - self._most_cont_group: Optional[IsolationPolicy] = None - self._least_cont_group: Optional[IsolationPolicy] = None - - self._prev_wls: Set[Workload] = set() + self._prev_wls: Set[IsolationPolicy] = set() self._violation_count: int = 0 - def __del__(self): - logger = logging.getLogger(__name__) - logger.info('SwapIsolator is closed...') - - def select_cont_group(self) -> None: + def _select_cont_groups(self) -> Optional[Tuple[IsolationPolicy, IsolationPolicy]]: """ Most contentious group is the group which shows "the LOWEST aggr. ipc diff" Least contentious group is the group which shows "the HIGHEST aggr. ipc diff" @@ -47,90 +32,84 @@ def select_cont_group(self) -> None: Assumption : Swap Isolator swaps workloads between the most cont. group and the least cont. group """ - swap_in_grp: Optional[IsolationPolicy] = None - swap_out_grp: Optional[IsolationPolicy] = None + contentions: Dict[IsolationPolicy, Tuple[float, float]] = { + group: ( + group.foreground_workload.calc_metric_diff().instruction_ps, + group.background_workload.calc_metric_diff().instruction_ps, + ) + for group in self._all_groups.keys() + } - for group in self._all_groups.keys(): - if swap_in_grp is None: - swap_in_grp = group - if swap_out_grp is None: - swap_out_grp = group + # TODO: more efficient implementation + for group1, (g1_fg_cont, g1_bg_cont) in contentions.items(): + for group2, (g2_fg_cont, g2_bg_cont) in contentions.items(): + if group1 == group2: + continue - group.update_aggr_instr() - swap_in_grp = max(swap_in_grp, group, key=lambda x: x.aggr_inst) - swap_out_grp = min(swap_out_grp, group, key=lambda x: x.aggr_inst) + if g1_fg_cont + g1_bg_cont <= self._INST_DIFF_THRESHOLD < g2_fg_cont + g2_bg_cont \ + and g1_fg_cont + g2_bg_cont > self._INST_DIFF_THRESHOLD \ + and g2_fg_cont + g1_bg_cont > 
self._INST_DIFF_THRESHOLD: + logging.getLogger(__name__).debug(f'{group1} and {group2} is selected as swap candidate') + return group1, group2 - self._most_cont_group = swap_out_grp - self._least_cont_group = swap_in_grp + return None def swap_is_needed(self) -> bool: - self.select_cont_group() - - # FIXME: We used the average ipc diff value (We assume two workloads in a group at most) - avg_min_ipc_diff = self._most_cont_group.aggr_inst / 2 + logger = logging.getLogger(__name__) + groups = self._select_cont_groups() - # TODO: Test the _IPC_DIFF_THRESHOLD - if avg_min_ipc_diff > self._IPC_DIFF_THRESHOLD: + if groups is None: self._prev_wls.clear() self._violation_count = 0 return False if len(self._prev_wls) is 2 \ - and self._most_cont_group.background_workload in self._prev_wls \ - and self._least_cont_group.background_workload in self._prev_wls: + and groups[0] in self._prev_wls \ + and groups[1] in self._prev_wls: self._violation_count += 1 - print( - f'violation count of {self._most_cont_group.background_workload}, ' - f'{self._least_cont_group.background_workload} is {self._violation_count}') - return self._violation_count >= SwapIsolator._VIOLATION_THRESHOLD + logger.debug( + f'violation count of {groups[0].background_workload}, ' + f'{groups[1].background_workload} is {self._violation_count}') + return self._violation_count >= self._VIOLATION_THRESHOLD else: self._prev_wls.clear() - self._prev_wls.add(self._most_cont_group.background_workload) - self._prev_wls.add(self._least_cont_group.background_workload) + self._prev_wls.add(groups[0]) + self._prev_wls.add(groups[1]) self._violation_count = 1 return False def do_swap(self) -> None: - # Enable CPUSET memory migration - self.pre_swap_setup() + logger = logging.getLogger(__name__) + group1, group2 = tuple(self._prev_wls) + logger.info(f'Starting swaption between {group1.background_workload} and {group2.background_workload}...') - out_wl = self._most_cont_group.background_workload - in_wl = 
self._least_cont_group.background_workload + workload1 = group1.background_workload + workload2 = group2.background_workload - print(f'swap {out_wl}, {in_wl}') + # Enable CPUSET memory migration + workload1.cgroup_cpuset.set_memory_migrate(True) + workload2.cgroup_cpuset.set_memory_migrate(True) try: # Suspend Procs and Enforce Swap Conf. - out_wl.pause() - in_wl.pause() + workload1.pause() + workload2.pause() - in_tmp, out_tmp = in_wl.orig_bound_mems, out_wl.orig_bound_mems - in_wl.orig_bound_mems, out_wl.orig_bound_mems = out_tmp, in_tmp - in_tmp, out_tmp = in_wl.orig_bound_cores, out_wl.orig_bound_cores - in_wl.orig_bound_cores, out_wl.orig_bound_cores = out_tmp, in_tmp + tmp1, tmp2 = workload2.orig_bound_mems, workload1.orig_bound_mems + workload2.orig_bound_mems, workload1.orig_bound_mems = tmp2, tmp1 + tmp1, tmp2 = workload2.orig_bound_cores, workload1.orig_bound_cores + workload2.orig_bound_cores, workload1.orig_bound_cores = tmp2, tmp1 - in_tmp, out_tmp = in_wl.bound_mems, out_wl.bound_mems - in_wl.bound_mems, out_wl.bound_mems = out_tmp, in_tmp - in_tmp, out_tmp = in_wl.bound_cores, out_wl.bound_cores - in_wl.bound_cores, out_wl.bound_cores = out_tmp, in_tmp - - self._most_cont_group.background_workload = in_wl - self._least_cont_group.background_workload = out_wl + group1.background_workload = workload2 + group2.background_workload = workload1 except (psutil.NoSuchProcess, subprocess.CalledProcessError, ProcessLookupError) as e: - print(e) + logger.warning('Error occurred during swaption', e) finally: # Resume Procs - out_wl.resume() - in_wl.resume() + workload1.resume() + workload2.resume() self._violation_count = 0 self._prev_wls.clear() - - def pre_swap_setup(self) -> None: - swap_out_workload = self._most_cont_group.background_workload - swap_in_workload = self._least_cont_group.background_workload - - swap_out_workload.cgroup_cpuset.set_memory_migrate(True) - swap_in_workload.cgroup_cpuset.set_memory_migrate(True) From 
e3baaf755feb6e65f8a456b2bedbef24005e48b8 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 23 Oct 2018 15:18:24 +0900 Subject: [PATCH 79/82] fixes corner cases of MetricDiff --- isolating_controller/metric_container/basic_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index aa6b44f..a94c924 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -150,10 +150,10 @@ def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: if prev.local_mem_ps == 0: self._local_mem_ps = 0 else: - self._local_mem_ps = -prev.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS + self._local_mem_ps = prev.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS elif prev.local_mem_ps == 0: # TODO: is it fair? - self._local_mem_ps = curr.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS + self._local_mem_ps = -curr.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS else: self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 From ae7671b653710907fee39b7fb52ee09c50afd23e Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 23 Oct 2018 15:19:13 +0900 Subject: [PATCH 80/82] change threshold of isolation weakening --- isolating_controller/isolation/isolators/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isolating_controller/isolation/isolators/base.py b/isolating_controller/isolation/isolators/base.py index 631aa18..459a538 100644 --- a/isolating_controller/isolation/isolators/base.py +++ b/isolating_controller/isolation/isolators/base.py @@ -11,7 +11,7 @@ class Isolator(metaclass=ABCMeta): _DOD_THRESHOLD: ClassVar[float] = 0.005 - _FORCE_THRESHOLD: ClassVar[float] = 0.1 + _FORCE_THRESHOLD: ClassVar[float] = 0.05 def __init__(self, foreground_wl: Workload, background_wl: Workload) -> None: self._prev_metric_diff: MetricDiff = None From 
026396e5a4eb6a078d960b0d09a52fa668b60aee Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Tue, 23 Oct 2018 15:20:06 +0900 Subject: [PATCH 81/82] re-implement swapper --- isolating_controller/isolation/swapper.py | 68 ++++++++++++++++------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/isolating_controller/isolation/swapper.py b/isolating_controller/isolation/swapper.py index 0edba8d..446e31f 100644 --- a/isolating_controller/isolation/swapper.py +++ b/isolating_controller/isolation/swapper.py @@ -2,18 +2,21 @@ import logging import subprocess +import time from typing import Dict, Optional, Set, Tuple import psutil from .policies.base import IsolationPolicy +from ..metric_container.basic_metric import MetricDiff class SwapIsolator: # FIXME: This threshold needs tests (How small diff is right for swapping workloads?) # "-0.5" means the IPCs of workloads in a group drop 50% compared to solo-run _INST_DIFF_THRESHOLD = -1 - _VIOLATION_THRESHOLD = 5 + _VIOLATION_THRESHOLD = 3 + _INTERVAL = 2000 def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: """ @@ -21,8 +24,9 @@ def __init__(self, isolation_groups: Dict[IsolationPolicy, int]) -> None: """ self._all_groups: Dict[IsolationPolicy, int] = isolation_groups - self._prev_wls: Set[IsolationPolicy] = set() + self._prev_grp: Set[IsolationPolicy] = set() self._violation_count: int = 0 + self._last_swap: int = 0 def _select_cont_groups(self) -> Optional[Tuple[IsolationPolicy, IsolationPolicy]]: """ @@ -31,41 +35,64 @@ def _select_cont_groups(self) -> Optional[Tuple[IsolationPolicy, IsolationPolicy Assumption : Swap Isolator swaps workloads between the most cont. group and the least cont. 
group """ + logger = logging.getLogger(__name__) + + def calc_benefit(g1_fg_diff: MetricDiff, g1_bg_diff: MetricDiff, + g2_fg_diff: MetricDiff, g2_bg_diff: MetricDiff, + attribute: str) -> float: + g1_fg_cont = getattr(g1_fg_diff, attribute) + g1_bg_cont = getattr(g1_bg_diff, attribute) + g2_fg_cont = getattr(g2_fg_diff, attribute) + g2_bg_cont = getattr(g2_bg_diff, attribute) + + current = abs(g1_fg_cont + g1_bg_cont) + abs(g2_fg_cont + g2_bg_cont) + future = abs(g1_fg_cont + g2_bg_cont) + abs(g2_fg_cont + g1_bg_cont) + benefit = current - future + + logger.debug(f'Calculating swaption benefit. current: {current}, future: {future}, benefit: {benefit}') + return benefit - contentions: Dict[IsolationPolicy, Tuple[float, float]] = { + contentions: Dict[IsolationPolicy, Tuple[MetricDiff, MetricDiff]] = { group: ( - group.foreground_workload.calc_metric_diff().instruction_ps, - group.background_workload.calc_metric_diff().instruction_ps, + group.foreground_workload.calc_metric_diff(), + group.background_workload.calc_metric_diff(), ) for group in self._all_groups.keys() } # TODO: more efficient implementation - for group1, (g1_fg_cont, g1_bg_cont) in contentions.items(): - for group2, (g2_fg_cont, g2_bg_cont) in contentions.items(): + for group1, g1_diffs in contentions.items(): + for group2, g2_diffs in contentions.items(): if group1 == group2: continue - if g1_fg_cont + g1_bg_cont <= self._INST_DIFF_THRESHOLD < g2_fg_cont + g2_bg_cont \ - and g1_fg_cont + g2_bg_cont > self._INST_DIFF_THRESHOLD \ - and g2_fg_cont + g1_bg_cont > self._INST_DIFF_THRESHOLD: - logging.getLogger(__name__).debug(f'{group1} and {group2} is selected as swap candidate') + group1.background_workload.cgroup_cpuset.read_cpus() + instr_benefit = calc_benefit(*g1_diffs, *g2_diffs, attribute='instruction_ps') + l3_benefit = calc_benefit(*g1_diffs, *g2_diffs, attribute='l3_hit_ratio') + mem_benefit = calc_benefit(*g1_diffs, *g2_diffs, attribute='local_mem_util_ps') + + if instr_benefit + l3_benefit + 
mem_benefit > 0.1: + logger.debug(f'{group1} and {group2} is selected as swap candidate') return group1, group2 return None def swap_is_needed(self) -> bool: + if time.time() - self._last_swap <= self._INTERVAL / 1_000: + return False + logger = logging.getLogger(__name__) groups = self._select_cont_groups() if groups is None: - self._prev_wls.clear() + self._prev_grp.clear() self._violation_count = 0 + logger.debug(f'violation count of swaption is cleared') return False - if len(self._prev_wls) is 2 \ - and groups[0] in self._prev_wls \ - and groups[1] in self._prev_wls: + if len(self._prev_grp) is 2 \ + and groups[0] in self._prev_grp \ + and groups[1] in self._prev_grp: self._violation_count += 1 logger.debug( f'violation count of {groups[0].background_workload}, ' @@ -73,15 +100,15 @@ def swap_is_needed(self) -> bool: return self._violation_count >= self._VIOLATION_THRESHOLD else: - self._prev_wls.clear() - self._prev_wls.add(groups[0]) - self._prev_wls.add(groups[1]) + self._prev_grp.clear() + self._prev_grp.add(groups[0]) + self._prev_grp.add(groups[1]) self._violation_count = 1 return False def do_swap(self) -> None: logger = logging.getLogger(__name__) - group1, group2 = tuple(self._prev_wls) + group1, group2 = tuple(self._prev_grp) logger.info(f'Starting swaption between {group1.background_workload} and {group2.background_workload}...') workload1 = group1.background_workload @@ -112,4 +139,5 @@ def do_swap(self) -> None: workload1.resume() workload2.resume() self._violation_count = 0 - self._prev_wls.clear() + self._prev_grp.clear() + self._last_swap = time.time() From 72850156e4a5d01a3b6ca9f12836b6ef708634f7 Mon Sep 17 00:00:00 2001 From: Byeonghoon Yoo Date: Wed, 24 Oct 2018 17:12:52 +0900 Subject: [PATCH 82/82] fixes criteria of swapper --- isolating_controller/isolation/swapper.py | 64 +++++++++---------- .../metric_container/basic_metric.py | 6 +- isolating_controller/workload.py | 4 +- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git 
a/isolating_controller/isolation/swapper.py b/isolating_controller/isolation/swapper.py index 446e31f..8ab4b9e 100644 --- a/isolating_controller/isolation/swapper.py +++ b/isolating_controller/isolation/swapper.py @@ -37,41 +37,39 @@ def _select_cont_groups(self) -> Optional[Tuple[IsolationPolicy, IsolationPolicy """ logger = logging.getLogger(__name__) - def calc_benefit(g1_fg_diff: MetricDiff, g1_bg_diff: MetricDiff, - g2_fg_diff: MetricDiff, g2_bg_diff: MetricDiff, - attribute: str) -> float: - g1_fg_cont = getattr(g1_fg_diff, attribute) - g1_bg_cont = getattr(g1_bg_diff, attribute) - g2_fg_cont = getattr(g2_fg_diff, attribute) - g2_bg_cont = getattr(g2_bg_diff, attribute) - - current = abs(g1_fg_cont + g1_bg_cont) + abs(g2_fg_cont + g2_bg_cont) - future = abs(g1_fg_cont + g2_bg_cont) + abs(g2_fg_cont + g1_bg_cont) - benefit = current - future - - logger.debug(f'Calculating swaption benefit. current: {current}, future: {future}, benefit: {benefit}') - return benefit - - contentions: Dict[IsolationPolicy, Tuple[MetricDiff, MetricDiff]] = { - group: ( - group.foreground_workload.calc_metric_diff(), - group.background_workload.calc_metric_diff(), - ) - for group in self._all_groups.keys() - } + contentions: Tuple[Tuple[IsolationPolicy, MetricDiff], ...] 
= tuple( + (group, group.foreground_workload.calc_metric_diff()) + for group in self._all_groups.keys() + ) # TODO: more efficient implementation - for group1, g1_diffs in contentions.items(): - for group2, g2_diffs in contentions.items(): - if group1 == group2: - continue - - group1.background_workload.cgroup_cpuset.read_cpus() - instr_benefit = calc_benefit(*g1_diffs, *g2_diffs, attribute='instruction_ps') - l3_benefit = calc_benefit(*g1_diffs, *g2_diffs, attribute='l3_hit_ratio') - mem_benefit = calc_benefit(*g1_diffs, *g2_diffs, attribute='local_mem_util_ps') - - if instr_benefit + l3_benefit + mem_benefit > 0.1: + for idx, (group1, g1_fg_diff) in enumerate(contentions): + for group2, g2_fg_diff in contentions[idx + 1:]: + g1_bg_curr_cores = len(group1.background_workload.cgroup_cpuset.read_cpus()) + g2_bg_curr_cores = len(group2.background_workload.cgroup_cpuset.read_cpus()) + + g1_fg_cont = g1_fg_diff.instruction_ps + g2_fg_cont = g2_fg_diff.instruction_ps + + g1_bg_cont = group1.background_workload.calc_metric_diff().instruction_ps + g2_bg_cont = group2.background_workload.calc_metric_diff().instruction_ps + current = abs(g1_fg_cont + g1_bg_cont) + abs(g2_fg_cont + g2_bg_cont) + + g1_bg_cont = group1 \ + .background_workload \ + .calc_metric_diff(g2_bg_curr_cores / g1_bg_curr_cores) \ + .instruction_ps + g2_bg_cont = group2 \ + .background_workload \ + .calc_metric_diff(g1_bg_curr_cores / g2_bg_curr_cores) \ + .instruction_ps + future = abs(g1_fg_cont + g2_bg_cont) + abs(g2_fg_cont + g1_bg_cont) + + benefit = current - future + logger.debug(f'Calculating swaption benefit. 
' + f'current: {current:>7.4f}, future: {future:>7.4}, benefit: {benefit:>7.4}') + + if benefit > 0.1: logger.debug(f'{group1} and {group2} is selected as swap candidate') return group1, group2 diff --git a/isolating_controller/metric_container/basic_metric.py b/isolating_controller/metric_container/basic_metric.py index a94c924..d4aecef 100644 --- a/isolating_controller/metric_container/basic_metric.py +++ b/isolating_controller/metric_container/basic_metric.py @@ -143,7 +143,7 @@ class MetricDiff: # FIXME: hard coded _MAX_MEM_BANDWIDTH_PS = 68 * 1024 * 1024 * 1024 - def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: + def __init__(self, curr: BasicMetric, prev: BasicMetric, core_norm: float = 1) -> None: self._l3_hit_ratio = curr.l3hit_ratio - prev.l3hit_ratio if curr.local_mem_ps == 0: @@ -155,9 +155,9 @@ def __init__(self, curr: BasicMetric, prev: BasicMetric) -> None: # TODO: is it fair? self._local_mem_ps = -curr.local_mem_ps / self._MAX_MEM_BANDWIDTH_PS else: - self._local_mem_ps = curr.local_mem_ps / prev.local_mem_ps - 1 + self._local_mem_ps = curr.local_mem_ps / (prev.local_mem_ps * core_norm) - 1 - self._instruction_ps = curr.instruction_ps / prev.instruction_ps - 1 + self._instruction_ps = curr.instruction_ps / (prev.instruction_ps * core_norm) - 1 @property def l3_hit_ratio(self) -> float: diff --git a/isolating_controller/workload.py b/isolating_controller/workload.py index b540e6e..8929de5 100644 --- a/isolating_controller/workload.py +++ b/isolating_controller/workload.py @@ -140,9 +140,9 @@ def avg_solorun_data(self) -> Optional[BasicMetric]: def avg_solorun_data(self, new_data: BasicMetric) -> None: self._avg_solorun_data = new_data - def calc_metric_diff(self) -> MetricDiff: + def calc_metric_diff(self, core_norm: float = 1) -> MetricDiff: curr_metric: BasicMetric = self._metrics[0] - return MetricDiff(curr_metric, self._avg_solorun_data) + return MetricDiff(curr_metric, self._avg_solorun_data, core_norm) def all_child_tid(self) -> 
Tuple[int, ...]: try: