-
Notifications
You must be signed in to change notification settings - Fork 6
/
main.go
242 lines (220 loc) · 8.1 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
/* Copyright(C) 2022-2023. Huawei Technologies Co.,Ltd. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package main implements initialization of the startup parameters of the device plugin.
package main
import (
"context"
"flag"
"fmt"
"os"
"huawei.com/npu-exporter/v5/common-utils/hwlog"
"huawei.com/npu-exporter/v5/devmanager"
"Ascend-device-plugin/pkg/common"
"Ascend-device-plugin/pkg/server"
)
const (
	// defaultLogPath is the default value of the logFile flag, i.e. the run log file path
	defaultLogPath = "/var/log/mindx-dl/devicePlugin/devicePlugin.log"
	// defaultListWatchPeriod is the default listening device state's period
	defaultListWatchPeriod = 5
	// maxListWatchPeriod is the max listening device state's period
	maxListWatchPeriod = 60
	// minListWatchPeriod is the min listening device state's period
	minListWatchPeriod = 3
	// maxLogLineLength is the MaxLineLength handed to the hwlog run logger config
	maxLogLineLength = 1024
	// defaultLinkdownTimeout is the default linkdown timeout duration
	defaultLinkdownTimeout = 30
	// maxLinkdownTimeout is the max linkdown timeout duration
	maxLinkdownTimeout = 30
	// minLinkdownTimeout is the min linkdown timeout duration
	minLinkdownTimeout = 1
)
// Command-line flags of the device plugin. They are parsed by flag.Parse in main,
// validated by checkParam and copied into common.ParamOption by setParameters.
var (
	fdFlag          = flag.Bool("fdFlag", false, "Whether to use fd system to manage device (default false)")
	useAscendDocker = flag.Bool("useAscendDocker", true, "Whether to use ascend docker. "+
		"This parameter will be deprecated in future versions")
	volcanoType = flag.Bool("volcanoType", false,
		"Specifies whether to use volcano for scheduling when the chip type is Ascend310 or Ascend910 (default false)")
	version     = flag.Bool("version", false, "Output version information")
	edgeLogFile = flag.String("edgeLogFile", "/var/alog/AtlasEdge_log/devicePlugin.log",
		"Log file path in edge scene")
	listWatchPeriod = flag.Int("listWatchPeriod", defaultListWatchPeriod,
		"Listen and watch device state's period, unit second, range [3, 60]")
	autoStowing = flag.Bool("autoStowing", true, "Whether to automatically stow the fixed device")
	logLevel    = flag.Int("logLevel", 0,
		"Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)")
	logMaxAge = flag.Int("maxAge", common.MaxAge,
		"Maximum number of days for backup run log files, range [7, 700] days")
	logFile = flag.String("logFile", defaultLogPath,
		"The log file path, if the file size exceeds 20MB, will be rotate")
	logMaxBackups = flag.Int("maxBackups", common.MaxBackups,
		"Maximum number of backup log files, range is (0, 30]")
	presetVirtualDevice = flag.Bool("presetVirtualDevice", true, "Open the static of "+
		"computing power splitting function, only support Ascend910 and Ascend310P")
	use310PMixedInsert = flag.Bool("use310PMixedInsert", false, "Whether to use mixed insert "+
		"ascend310P-V, ascend310P-VPro, ascend310P-IPro card mode")
	hotReset      = flag.Int("hotReset", -1, "set hot reset mode: -1-close, 0-infer, 1-train")
	shareDevCount = flag.Uint("shareDevCount", 1, "share device function, enable the func by setting "+
		"a value greater than 1, range is [1, 100], only support 310B")
	// The usage text previously rendered as "duration, , range [1, 30]" because both
	// concatenated halves carried a comma.
	linkdownTimeout = flag.Int64("linkdownTimeout", defaultLinkdownTimeout,
		"linkdown timeout duration, range [1, 30]")
)
// Build information. Nothing in this file assigns these variables.
// NOTE(review): presumably injected at link time (e.g. -ldflags "-X ...") — confirm against the build script.
var (
	// BuildName is the application name printed by the -version flag
	BuildName string
	// BuildVersion is the application version printed by the -version flag and logged at startup
	BuildVersion string
	// BuildScene is the application starting scene; checkParam only accepts
	// common.EdgeScene and common.CenterScene
	BuildScene string
)
// initLogModule initializes the hwlog run logger.
//
// The log file is *edgeLogFile when fdFlag is set, otherwise *logFile. The path must
// pass common.CheckFileUserSameWithProcess before the logger is created.
// ctx is handed to hwlog.InitRunLogger.
//
// A non-nil error is returned when the file check or the logger initialization fails.
// Because the run logger is not available in either case, both failures are also
// reported on stdout so that the process never exits silently.
func initLogModule(ctx context.Context) error {
	loggerPath := *logFile
	if *fdFlag {
		loggerPath = *edgeLogFile
	}
	if !common.CheckFileUserSameWithProcess(loggerPath) {
		// The caller only inspects err != nil, so print here like the init failure below.
		fmt.Printf("check log file %s failed\n", loggerPath)
		return fmt.Errorf("check log file failed")
	}
	hwLogConfig := hwlog.LogConfig{
		LogFileName:   loggerPath,
		LogLevel:      *logLevel,
		MaxBackups:    *logMaxBackups,
		MaxAge:        *logMaxAge,
		MaxLineLength: maxLogLineLength,
	}
	if err := hwlog.InitRunLogger(&hwLogConfig, ctx); err != nil {
		fmt.Printf("hwlog init failed, error is %v\n", err)
		return err
	}
	return nil
}
// checkParam validates the parsed flags and the build scene.
//
// Checks run in order; the first violation is written to hwlog.RunLog at error level
// and false is returned. It returns true only when every check, including
// checkShareDevCount, passes.
func checkParam() bool {
	if *listWatchPeriod < minListWatchPeriod || *listWatchPeriod > maxListWatchPeriod {
		hwlog.RunLog.Errorf("list and watch period %d out of range", *listWatchPeriod)
		return false
	}
	// Without preset virtual devices, volcano scheduling is mandatory.
	if !(*presetVirtualDevice) && !(*volcanoType) {
		hwlog.RunLog.Error("presetVirtualDevice is false, volcanoType should be true")
		return false
	}
	// 310P mixed insert is mutually exclusive with volcano scheduling.
	if *use310PMixedInsert && *volcanoType {
		hwlog.RunLog.Error("use310PMixedInsert is true, volcanoType should be false")
		return false
	}
	// Device sharing (shareDevCount > 1) is rejected together with mixed insert,
	// disabled preset virtual devices, or volcano scheduling.
	if *use310PMixedInsert && *shareDevCount > 1 {
		hwlog.RunLog.Error("use310PMixedInsert is true, shareDevCount should be 1")
		return false
	}
	if !(*presetVirtualDevice) && *shareDevCount > 1 {
		hwlog.RunLog.Error("presetVirtualDevice is false, shareDevCount should be 1")
		return false
	}
	if *volcanoType && *shareDevCount > 1 {
		hwlog.RunLog.Error("volcanoType is true, shareDevCount should be 1")
		return false
	}
	switch *hotReset {
	case common.HotResetClose, common.HotResetInfer, common.HotResetTrain:
	default:
		hwlog.RunLog.Error("hot reset mode param invalid")
		return false
	}
	if BuildScene != common.EdgeScene && BuildScene != common.CenterScene {
		hwlog.RunLog.Error("unSupport build scene, only support edge and center")
		return false
	}
	if *linkdownTimeout < minLinkdownTimeout || *linkdownTimeout > maxLinkdownTimeout {
		// This rejection aborts startup like the ones above, so it is logged as an
		// error and carries the offending value.
		hwlog.RunLog.Errorf("linkdown timeout duration %d out of range", *linkdownTimeout)
		return false
	}
	return checkShareDevCount()
}
// checkShareDevCount reports whether the shareDevCount flag lies within
// [1, common.MaxShareDevCount]; an out-of-range value is logged as an error.
func checkShareDevCount() bool {
	count := *shareDevCount
	if count >= 1 && count <= common.MaxShareDevCount {
		return true
	}
	hwlog.RunLog.Error("share device function params invalid")
	return false
}
// main parses the flags, initializes logging, validates the parameters, creates the
// device manager and then starts listening for devices until SignCatch returns.
// With -version it only prints the build name and version.
func main() {
	flag.Parse()
	if *version {
		fmt.Printf("%s version: %s\n", BuildName, BuildVersion)
		return
	}
	ctx, cancel := context.WithCancel(context.Background())
	// Release the context on every exit path, including the early returns below.
	// Calling a CancelFunc again after SignCatch has used it is a no-op.
	defer cancel()
	if err := initLogModule(ctx); err != nil {
		return
	}
	if !checkParam() {
		return
	}
	hwlog.RunLog.Infof("ascend device plugin starting and the version is %s", BuildVersion)
	hwlog.RunLog.Infof("ascend device plugin starting scene is %s", BuildScene)
	// setParameters replaces common.ParamOption as a whole, so it runs before
	// InitFunction and setUseAscendDocker, which runs last and reads
	// common.ParamOption.ProductTypes.
	setParameters()
	hdm, err := InitFunction()
	if err != nil {
		return
	}
	setUseAscendDocker()
	go hdm.ListenDevice(ctx)
	hdm.SignCatch(cancel)
}
// InitFunction builds the hardware device manager used by main.
// It auto-initializes the underlying devmanager and wraps it in a server.HwDevManager.
// Both failure cases are logged; the returned manager is nil whenever the error is non-nil.
func InitFunction() (*server.HwDevManager, error) {
	dmgr, initErr := devmanager.AutoInit("")
	if initErr != nil {
		hwlog.RunLog.Errorf("init devmanager failed, err: %v", initErr)
		return nil, initErr
	}
	manager := server.NewHwDevManager(dmgr)
	if manager != nil {
		hwlog.RunLog.Info("init device manager success")
		return manager, nil
	}
	hwlog.RunLog.Error("init device manager failed")
	return nil, fmt.Errorf("init device manager failed")
}
// setParameters replaces common.ParamOption with a fresh common.Option built from the
// parsed flags and BuildScene. Fields not listed here are left at their zero value.
func setParameters() {
	common.ParamOption = common.Option{
		// Build scene.
		BuildScene: BuildScene,
		// Values of the boolean flags.
		GetFdFlag:          *fdFlag,
		UseVolcanoType:     *volcanoType,
		UseAscendDocker:    *useAscendDocker,
		Use310PMixedInsert: *use310PMixedInsert,
		PresetVDevice:      *presetVirtualDevice,
		AutoStowingDevs:    *autoStowing,
		// Values of the numeric flags.
		ListAndWatchPeriod: *listWatchPeriod,
		LinkdownTimeout:    *linkdownTimeout,
		ShareCount:         *shareDevCount,
		HotReset:           *hotReset,
	}
}
func setUseAscendDocker() {
*useAscendDocker = true
ascendDocker := os.Getenv("ASCEND_DOCKER_RUNTIME")
if ascendDocker != "True" {
*useAscendDocker = false
hwlog.RunLog.Debugf("get ASCEND_DOCKER_RUNTIME from env is: %#v", ascendDocker)
}
if common.ParamOption.Use310PMixedInsert {
*useAscendDocker = false
hwlog.RunLog.Debugf("310P mixed insert mode do not use ascend docker")
}
if len(common.ParamOption.ProductTypes) == 1 && common.ParamOption.ProductTypes[0] == common.Atlas200ISoc {
*useAscendDocker = false
hwlog.RunLog.Debugf("your device-type is: %v", common.Atlas200ISoc)
}
common.ParamOption.UseAscendDocker = *useAscendDocker
hwlog.RunLog.Infof("device-plugin set ascend docker as: %v", *useAscendDocker)
}