-
Notifications
You must be signed in to change notification settings - Fork 3
/
ibcheck
executable file
·1835 lines (1559 loc) · 61.6 KB
/
ibcheck
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python2
"""Swiss army knife for Infiniband (IB) troubleshooting.
############################################################################
# #
# Copyright (c) 2010-2012, The Regents of the University of California, #
# through Lawrence Berkeley National Laboratory (subject to receipt of any #
# required approvals from the U.S. Dept. of Energy). All rights reserved. #
# #
# #
# Author: Yong Qin <[email protected]> #
# High Performance Computing Services (http://scs.lbl.gov/) #
# #
############################################################################
############################################################################
# #
# Copyright (c) 2017,2020, Yong Qin. All rights reserved. #
# #
# #
# Author: Yong Qin <[email protected]> #
# #
############################################################################
IBCHECK is a utility that uses the standard tools provided by OFED
infiniband-diags package to perform comprehensive in-band IB troubleshooting.
IBCHECK modules:
1. Topology analyzer.
2. Subnet Manager (SM) scanner.
3. Performance Manager (PM) scanner.
IBCHECK files:
1. Fabric definition file - supplied with "-f" or "-c" option.
Fabric definition file provides a key/value pair definition for a given
GUID/NodeDesc (node description) mapping. It can also be organized as
virtual fabric which is useful when used with "-C" option. E.g.,
[lustre]
0x0005ad00001df29a=ib000.lustre (Cisco SFS7000D, W29-36)
0x0002c9020028ea84=n0000.lustre (mds00)
0x00066a0098009c8a=n0002.lustre (oss00)
0x0005ad00000c6410=n0003.lustre (oss01)
...
2. Topology definition file - supplied with "-t" option.
Topology definition file has the same format as the output from
"ibnetdiscover" command. When this file is provided, IBCHECK will perform
topology comparison (see topology analyzer section for details). The best
practice for creating such file would be during the fabric bring-up stage,
after everything is tested and confirmed in a good working manner. One
can use the following command to generate this file.
$ ibnetdiscover > ibnetdiscover_myfabric.txt
IBCHECK fabric selection:
By default all devices on the fabric are selected to perform an action.
However the following options can also be used to further narrow down the
portion of the fabric to work with.
1. By pre-defined virtual fabric in the fabric file: -c <CONF> -C <FABRIC>
2. By device type: -T <TYPE>
3. By spine index: -S <SPINE>
4. By leaf/line index: -L <LEAF>
5. By NodeDesc: -N <NODEDESC>
6. By GUID: -g <GUID>
7. By LID: -l <LID>
8. By link speed: -s <SPEED>
9. By link width: -w <WIDTH>
One can also use the "-A" option to switch from the default "or" operation
to an "and" operation to further control the selection. In complex environment,
it is recommended to construct a virtual fabric and use "-C" to control the
target selection.
IBCHECK detail level:
Detail level ("-d") option can be stacked and controls the granularity of the
action. "-d" has the same controllability as without providing a detail level,
which presents all devices that are selected (see the fabric selection section).
"-dd" presents all ports with a link on the selected devices. "-ddd" presents
all ports on the selected devices regardless whether there is a link or not.
Behavior for more than 3 levels of details ("-ddd") is not defined.
IBCHECK topology analyzer:
Topology analysis is always performed. Depending on the detail level that
IBCHECK is provided, it shows different level of details of the fabric.
If a topology file is provided, IBCHECK will compare the current physical
fabric with the fabric defined in the topology file to identify differences.
It is useful when troubleshooting large fabric with dead/dropped links or ib
devices, or comparing fabric changes.
Topology analyzer also has a visualization component ("-G") which requires
graphviz and pydot packages to be available. This feature is experimental.
It can also be run in the "dryrun" mode ("-D"), which requires a topology
file (provided with "-t" option). In this mode it does not try to retrieve
the physical fabric topology but perform analysis only on the provided file,
which is useful in offline analysis and visualization.
IBCHECK SM scanner:
SM scanner is activated with "-M" option on selected devices. Note that
a detail level setting controls its behavior as well. If "-d" is used, only
device level scanning is performed, which could lead to less information than
expected, as the device GUID for HCA could be different from its port GUID,
which a host-based SM typically runs on. Thus "-dd" is recommended in all
situations for the SM scanner. "-ddd" leads to scanning of empty ports
without a link, which is not necessary.
IBCHECK PM scanner:
PM scanner ("-E") has two running modes: 1). Monitor mode; 2). Batch mode.
Monitor mode is default and it launches a text-based user interface (TUI)
with an embedded mini help page. One can use the 'h' key to activate it. Keys
'[0-9]' and '[a-f]' can be used to toggle different error/performance counters
to show. Arrow keys and 'n', 'p', ',', '.', PgUp/PgDn can be used to scroll
up/down, and left/right if the display is out of range. 'q' quits from the
Monitor mode. Space bar switches between showing devices/ports with error
counters above the thresholds and all devices/ports (similar to "-a" option).
Batch mode ("-b") performs the same type of PM scan, but it displays results
in a loggable way, which can be used for offline analysis, such as trending
analysis. In Batch mode, all error/performance counters are displayed.
One can also reset the counters on the selected devices/ports. In the monitor
mode, this can be done by pressing the 'r' key; in the batch mode, "-r" achieves
the same goal.
By default PM scanner only displays IB devices with error counters greater
than the preset thresholds. This can be turned off by supplying "-a" option.
Also by default it runs with all available cores on the host. This behavior can
be adjusted by providing "-p <THREADS>" option, which controls how many
background processes to spawn. The default scan interval is 60 seconds, and it
can be changed with "-i <INTERVAL>". In an online monitoring mode, "-i 5" gives
a good refresh rate. However in the batch mode, "-i 300" or greater can be
used as a good practice. Another option "-n <ITERATIONS>" controls how many
times the PM scanner should be run in the batch mode.
Again, "-dd" is recommended for PM scanner module in all cases.
Examples:
1. wwibcheck -h
Display the help page.
2. wwibcheck
Sweep the fabric, display device counts.
3. wwibcheck -d
Sweep the fabric, display all devices found.
4. wwibcheck -dd
Sweep the fabric, display all devices, as well as active ports.
5. wwibcheck -ddd
Sweep the fabric, display all devices and all ports.
6. wwibcheck -N n0000 -d
Display all devices on the fabric with a node description starting with
"n0000".
7. wwibcheck -f fabrics.conf -d
Display all devices, use their designated node description defined in
fabrics.conf
8. wwibcheck -f fabrics.conf -N n0000 -d
Display all devices with a node description starting with "n0000", use
their designated node descriptions.
9. wwibcheck -f fabrics.conf -N n0000.lr -d
Display device which has a node description "n0000.lr" defined in
fabrics.conf
10. wwibcheck -f fabrics.conf -t ibnetdiscover_lr.txt
Sweep the fabric, compare it with the fabric topology defined in
"ibnetdiscover_lr.txt". This helps to identify dead links/devices
immediately.
11. wwibcheck -f fabrics.conf -C lr -d
Sweep the fabric, only show devices defined in virtual fabric "lr".
12. wwibcheck -f fabrics.conf -N ib000.lr,n0000.lr -d
Only display devices "ib000.lr" and "n0000.lr".
13. wwibcheck -f fabrics.conf -g 0x0002c90200431448 -d
Only display device with a GUID "0x0002c90200431448".
14. wwibcheck -f fabrics.conf -l 32 -d
Only display device with a LID "32".
15. wwibcheck -f fabrics.conf -dd -M
Scan and display all subnet managers on the fabric.
16. wwibcheck -f fabrics.conf -dd -E
Activate the Monitor mode of the PM scanner, run with a 60 seconds
interval, until 'q' or 'ESC' is pressed.
17. wwibcheck -f fabrics.conf -dd -C lr -E -i5
Monitor the error counters of the virtual fabric "lr", with a scan
interval of 5 seconds.
18. wwibcheck -f fabrics.conf -dd -C lr -E -b
Activate the Batch mode of the PM scanner on the virtual fabric "lr",
run once and exit.
19. wwibcheck -f fabrics.conf -dd -C lr -E -b -i5 -n3
Run PM scanner in Batch mode, run 3 times with a scan interval of 5
seconds.
20. wwibcheck -f fabrics.conf -dd -C lr -E -r -i5
Reset error counters on the virtual fabric "lr", then start PM scanner
in Monitor mode with a scan interval of 5 seconds.
"""
# Module metadata: author contact, release date and version string.
__author__ = "Yong Qin <[email protected]>"
__date__ = "September 14, 2012"
__version__ = "0.1"
import ConfigParser
import curses
import datetime
import getopt
import os
import pydoc
import Queue
import re
import signal
import subprocess
import sys
import threading
import time
import traceback
# pydot is optional; WITH_PYDOT records whether the import succeeded.
try:
    import pydot
    WITH_PYDOT = True
except ImportError:
    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the "print >> stream" statement form.
    sys.stderr.write('Warning: Cannot import \'pydot\'\n')
    WITH_PYDOT = False

# root: WITH_ROOT is True when the effective uid is 0. The functions that
# run external IB commands prefix them with "sudo" when it is False.
WITH_ROOT = os.geteuid() == 0
# global definitions

# ibnode nodetypes - the main categories that the topology uses
ibnode_nodetype = ['Switch', 'Ca']

# ibnode types - subcategories to organize different device types
ibnode_type = ['Switch', 'Spine', 'Leaf', 'Line', 'Ca']

# different categories that ibnodes will be organized into
collect_list = [
    'nodedesc',
    'desc',
    'nodeguid',
    'nodelid',
    'type',
    'portguid',
    'portlid',
    'speed',
    'width',
]

# performance counters: first group of error counters
ibperf_err1 = [
    'LinkDowned',
    'LinkRecovers',
    'RcvErrors',
    'SymbolErrors',
]

# performance counters: second group of error counters
ibperf_err2 = [
    'ExcBufOverrunErrors',
    'LinkIntegrityErrors',
    'QP1Dropped',
    'RcvConstraintErrors',
    'RcvRemotePhysErrors',
    'RcvSwRelayErrors',
    'VL15Dropped',
    'XmtConstraintErrors',
    'XmtDiscards',
    'XmtWait',
]

# performance counters: data/packet counters
ibperf_data = [
    'MulticastRcvPkts',
    'MulticastXmitPkts',
    'RcvData',
    'RcvPkts',
    'UnicastRcvPkts',
    'UnicastXmitPkts',
    'XmtData',
    'XmtPkts',
]

# all error counters, and all counters
ibperf_errs = ibperf_err1 + ibperf_err2
ibperf_counters = ibperf_errs + ibperf_data

# long counter name -> short counter name used in the lists above
ibperf_hash = {
    'ExcessiveBufferOverrunErrors': 'ExcBufOverrunErrors',
    'LinkDownedCounter': 'LinkDowned',
    'LinkErrorRecoveryCounter': 'LinkRecovers',
    'LocalLinkIntegrityErrors': 'LinkIntegrityErrors',
    'PortMulticastRcvPkts': 'MulticastRcvPkts',
    'PortMulticastXmitPkts': 'MulticastXmitPkts',
    'PortRcvConstraintErrors': 'RcvConstraintErrors',
    'PortRcvData': 'RcvData',
    'PortRcvErrors': 'RcvErrors',
    'PortRcvPkts': 'RcvPkts',
    'PortRcvRemotePhysicalErrors': 'RcvRemotePhysErrors',
    'PortRcvSwitchRelayErrors': 'RcvSwRelayErrors',
    'PortUnicastRcvPkts': 'UnicastRcvPkts',
    'PortUnicastXmitPkts': 'UnicastXmitPkts',
    'PortXmitConstraintErrors': 'XmtConstraintErrors',
    'PortXmitData': 'XmtData',
    'PortXmitDiscards': 'XmtDiscards',
    'PortXmitPkts': 'XmtPkts',
    'PortXmitWait': 'XmtWait',
    'QP1Dropped': 'QP1Dropped',
    'SymbolErrorCounter': 'SymbolErrors',
    'VL15Dropped': 'VL15Dropped',
}

# thresholds for performance counters (ibperf_err1 + ibperf_err2)
ibperf_errs_threshold = {
    'ExcBufOverrunErrors': 10,
    'LinkDowned': 10,
    'LinkIntegrityErrors': 10,
    'LinkRecovers': 10,
    'RcvConstraintErrors': 100,
    'RcvErrors': 10,
    'RcvRemotePhysErrors': 100,
    'RcvSwRelayErrors': 100,
    'QP1Dropped': 10,
    'SymbolErrors': 10,
    'VL15Dropped': 100,
    'XmtConstraintErrors': 100,
    'XmtDiscards': 100,
    'XmtWait': 100,
}
# functions
def error(message, errcode=-1):
    """
    Print the error message to stderr and exit with an error code.

    message -- text printed after the 'Error: ' prefix.
    errcode -- status handed to sys.exit() (default -1).

    Does not return: sys.exit() raises SystemExit.
    """
    # sys.stderr.write() instead of the Python-2-only "print >>" statement
    # so the function works on Python 2 and Python 3 alike.
    sys.stderr.write('Error: %s\n' % (message))
    sys.exit(errcode)
def warning(message):
    """
    Print the warning message to stderr, prefixed with 'Warning: '.
    """
    # sys.stderr.write() instead of the Python-2-only "print >>" statement
    # so the function works on Python 2 and Python 3 alike.
    sys.stderr.write('Warning: %s\n' % (message))
def info(message):
    """
    Print the info message to stderr, prefixed with 'Info: '.
    """
    # sys.stderr.write() instead of the Python-2-only "print >>" statement
    # so the function works on Python 2 and Python 3 alike.
    sys.stderr.write('Info: %s\n' % (message))
def cpu_number():
    """
    Obtain the number of cores/cpus that is available.

    Returns the value of sysconf 'SC_NPROCESSORS_ONLN' when it is a
    positive integer, otherwise 0 (also when the value cannot be queried).
    """
    try:
        cpu = int(os.sysconf('SC_NPROCESSORS_ONLN'))
    except (AttributeError, ValueError, OSError):
        # AttributeError: os.sysconf is not available on this platform.
        # ValueError: the configuration name is not known.
        # OSError: the name is known but not supported by the host.
        return 0
    if cpu > 0:
        return cpu
    return 0
def run_cmd(cmd):
    """
    Run command in subprocess, capture outputs and return.

    cmd -- command line, executed through the shell.

    Returns a dict with two keys:
      'retstr' -- combined stdout/stderr output of the command (text).
      'retval' -- exit status of the command.

    Raises OSError (with the failing command in its args) when the
    subprocess cannot be started.
    """
    try:
        # universal_newlines=True makes the pipe return text on Python 3 as
        # well, which is what the regex based parsers in this file expect.
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             universal_newlines=True)
    except OSError as exc:
        # the original message template was never filled in with the command
        exc.args = ('Can not run "%s"' % cmd,)
        raise
    # communicate() drains the pipe and waits for the process in one step
    output = p.communicate()[0]
    return {'retstr': output, 'retval': p.returncode}
def assert_file(file):
    """
    Raise RuntimeError if file does not exist (or is not a regular file).

    file -- path to check.

    Returns None when the file exists.
    """
    if not os.path.isfile(file):
        # call-style raise is valid on both Python 2 and Python 3
        raise RuntimeError('Can not find file "%s"' % file)
def read_file(file):
    """
    Read the whole content of file and return it as one string.

    file -- path of the file to read.

    Raises RuntimeError when the file does not exist (via assert_file) or
    when it cannot be read.
    """
    assert_file(file)
    try:
        # the context manager closes the handle even when read() fails
        with open(file) as handle:
            return handle.read()
    except (IOError, OSError, UnicodeError):
        # only I/O and decoding failures are translated; KeyboardInterrupt
        # and SystemExit are no longer swallowed as with a bare "except:"
        raise RuntimeError('Failed to read from "%s"' % file)
def uniq_list(seq):
    """
    Obtain the unique elements from a list.

    seq -- iterable of hashable elements.

    Returns a new list holding each distinct element once, in the order of
    first appearance. (A real list is returned on Python 3 as well, where
    dict.keys() would only be a view.)
    """
    seen = set()
    unique = []
    for each in seq:
        if each not in seen:
            seen.add(each)
            unique.append(each)
    return unique
def parse_config(config_file, clusters, verbose=False):
    """
    Read the fabric config file and build the guid -> desc (hostname) map.

    config_file -- path of the INI-style fabric definition file.
    clusters    -- section names to load; an empty value loads all sections.
    verbose     -- print a progress message.

    Returns a dict keyed by the integer GUID with the description as value.
    Exits through error() when the file is missing or a requested cluster
    is not a section of the file.
    """
    if verbose:
        info('Reading config file %s ...' % config_file)

    try:
        assert_file(config_file)
    except RuntimeError:
        error('Failed to read from %s' % config_file)

    parser = ConfigParser.SafeConfigParser()
    parser.read(config_file)
    defined = parser.sections()

    # sanity check: every requested cluster has to be defined in the file
    if clusters and not set(defined).issuperset(clusters):
        error('Not all clusters in %s are defined in %s' % (clusters, config_file))

    # no explicit selection means every section of the file
    selected = clusters if clusters else defined

    mapping = {}
    for section in selected:
        mapping.update((int(guid, 0), desc)
                       for guid, desc in parser.items(section))
    return mapping
def parse_topology(topo_string, guid_desc_mapping, hca='', dryrun=False, verbose=False):
    """
    Parse a topology which has the ibnetdiscover format.

    topo_string       -- ibnetdiscover output; a node block ends with an
                         empty line.
    guid_desc_mapping -- dict mapping integer GUIDs to descriptions.
    hca               -- HCA name handed on to parse_ibnode().
    dryrun            -- handed on to parse_ibnode().
    verbose           -- print a progress message.

    Returns the tuple (ibnodes, ibnodes_index, links):
      ibnodes       -- ibnode dicts keyed by node GUID.
      ibnodes_index -- per category ('desc', 'nodedesc', 'nodeguid',
                       'nodelid', 'type', 'portguid', 'portlid', 'speed',
                       'width') a dict mapping each value to the list of
                       node GUIDs that carry it.
      links         -- list of ((guid, port), (toguid, toport), speed, width)
                       tuples; a link seen from both of its ends is listed
                       once.
    """
    if verbose:
        info('Processing topology ...')

    ibnodes = {}
    ibnodes_index = {}
    links = []

    # separate it into blocks
    block_pattern = re.compile(r'(.*?)\n\n', re.DOTALL)

    # parse all blocks to retrieve ibnodes
    for block_match in block_pattern.finditer(topo_string):
        # hca and dryrun are passed by keyword: passed positionally, dryrun
        # would land in parse_ibnode()'s "hca" parameter and be ignored.
        ibnode = parse_ibnode(block_match.group(1), guid_desc_mapping,
                              hca=hca, dryrun=dryrun, verbose=False)
        if not ibnode:
            continue

        nodeguid = ibnode['nodeguid']
        ibnodes[nodeguid] = ibnode

        # put it into ibnodes_index{} as well: node level categories ...
        for collect in ['desc', 'nodedesc', 'nodeguid', 'nodelid', 'type']:
            guids = ibnodes_index.setdefault(collect, {}).setdefault(
                ibnode[collect], [])
            if nodeguid not in guids:
                guids.append(nodeguid)

        # ... and port level categories
        for collect in ['portguid', 'portlid', 'speed', 'width']:
            by_value = ibnodes_index.setdefault(collect, {})
            for ibport in ibnode['nodeports']:
                guids = by_value.setdefault(ibport[collect], [])
                if nodeguid not in guids:
                    guids.append(nodeguid)

    # sort each sub-category in ibnodes_index['type']
    if ibnodes_index:
        for node_type in ibnodes_index['type'].keys():
            members = ibnodes_index['type'][node_type]
            if node_type in ['Spine', 'Leaf', 'Line']:
                try:
                    members.sort(
                        key=lambda guid: int(ibnodes[guid]['typeindex']))
                except ValueError:
                    # a type index that is not a plain number: sort as text
                    members.sort(
                        key=lambda guid: ibnodes[guid]['typeindex'])
            else:
                members.sort(key=lambda guid: ibnodes[guid]['desc'])

    # parse all ibnodes to construct link graph
    # link[0] = (guid, port)
    # link[1] = (toguid, toport)
    # link[2] = speed
    # link[3] = width
    recorded = set()    # same content as links, for O(1) membership tests
    for ibnode in ibnodes.values():
        guid = ibnode['nodeguid']
        for ibport in ibnode['nodeports']:
            toguid = ibport['tonodeguid']
            if not toguid:
                continue
            here = (guid, ibport['port'])
            there = (toguid, ibport['tonodeport'])
            speed = ibport['speed']
            width = ibport['width']
            # skip the link when it was already recorded from the other end
            if (there, here, speed, width) not in recorded:
                link = (here, there, speed, width)
                links.append(link)
                recorded.add(link)

    return ibnodes, ibnodes_index, links
def parse_ibnode(block, guid_desc_mapping, hca='', dryrun=False, verbose=False):
    """
    Parse one ibnetdiscover node block and categorize the node as a spine,
    leaf, plain switch, or ca.

    block             -- text of one node block.
    guid_desc_mapping -- dict mapping integer GUIDs to descriptions.
    hca               -- optional HCA name added to the ibaddr command.
    dryrun            -- when true, skip the ibaddr check of switches.
    verbose           -- warn about blocks/ports that cannot be parsed.

    Returns the ibnode dict, an empty dict when the block is empty or does
    not match, or None when the ibaddr command run for a switch returns a
    non-zero status.
    """
    if not block:
        warning('Empty block')
        return {}

    # match the header part of each block (multiline)
    node_pattern = re.compile(r'^vendid=(?P<vendid>.*)\ndevid=(?P<devid>.*)\nsysimgguid=(?P<sysimgguid>.*)\n(switchguid=(?P<switchguid>.*)\(.*\))?(caguid=(?P<caguid>.*))?\n(?P<nodetype>Switch|Ca)\s+(?P<portcount>\d+)\s+\"(?P<nodeguid>.*)\"\s+\#\s+\"(?P<nodedesc>.*)\"(.*\s+port\s+(?P<nodeport>\d+)\s+lid\s+(?P<nodelid>\d+)\s+lmc\s+(?P<nodelmc>\d+))?\n(?P<nodeports>.*)', re.DOTALL)

    # match the real type of a switch (Spine|Line|Leaf)
    # Mellanox
    type_pattern_m = re.compile(r'^(?P<model>MF0.*)\s*\/(?P<type>(S|L))(?P<typeindex>\d+)\/.*$')
    # QLogic
    type_pattern_q = re.compile(r'^(?P<model>QLogic.*)\s*(?P<type>(S|L))(?P<typeindex>\d+\w*)$')
    # Voltaire
    type_pattern_v = re.compile(r'^(?P<model>Voltaire.*)\s*(?P<type>(((Spine|Leaf|Line)\s+)))(?P<typeindex>\d+)\s*.*$')

    node_match = node_pattern.match(block)
    if not node_match:
        if verbose:
            warning('Cannot parse the following section:\n%s' % block)
        return {}

    # every named group of the header becomes a field (None if absent)
    ibnode = node_match.groupdict()
    ibnode['nodedesc'] = ibnode['nodedesc'].strip()
    nodetype = ibnode['nodetype']

    if nodetype == 'Switch':
        # generalize nodeguid
        ibnode['nodeguid'] = ibnode['switchguid']

        # check if this ibnode is online or not
        if not dryrun:
            if hca:
                cmd = 'ibaddr -C %s -G %s' % (hca, ibnode['nodeguid'])
            else:
                cmd = 'ibaddr -G %s' % ibnode['nodeguid']
            if not WITH_ROOT:
                cmd = 'sudo %s' % cmd
            if run_cmd(cmd)['retval']:
                warning('%s found on the fabric but not alive' % ibnode['nodeguid'])
                return None

        # generalize model, type, and typeindex: first vendor pattern wins
        type_match = None
        for pattern in (type_pattern_m, type_pattern_q, type_pattern_v):
            type_match = pattern.match(ibnode['nodedesc'])
            if type_match:
                break

        if type_match:
            kind = type_match.group('type').strip()
            if kind in ['Spine', 'S']:
                kind = 'Spine'
            elif kind in ['Leaf', 'Line', 'L']:
                kind = 'Leaf'
            ibnode['model'] = type_match.group('model').strip()
            ibnode['type'] = kind
            ibnode['typeindex'] = type_match.group('typeindex').strip()
        else:
            ibnode['model'] = ibnode['nodedesc']
            ibnode['type'] = nodetype
            ibnode['typeindex'] = None
    elif nodetype == 'Ca':
        ibnode['nodeguid'] = ibnode['caguid']
        ibnode['model'] = None
        ibnode['type'] = nodetype
        ibnode['typeindex'] = None
    else:
        warning('Unknown IB node type %s' % nodetype)
        ibnode['model'] = None
        ibnode['type'] = 'Unknown'
        ibnode['typeindex'] = None

    # digitize fields (base 0 accepts both "0x..." and decimal text)
    numeric_fields = ['vendid', 'devid', 'sysimgguid', 'switchguid', 'caguid',
                      'nodeguid', 'nodelid', 'nodelmc', 'portcount', 'nodeport']
    for field in numeric_fields:
        if ibnode[field]:
            ibnode[field] = int(ibnode[field], 0)

    # if guid_desc_mapping defines this GUID, use it for ibnode['desc'],
    # otherwise fall back to the original ibnode['nodedesc']
    ibnode['desc'] = guid_desc_mapping.get(ibnode['nodeguid'], ibnode['nodedesc'])

    # parse nodeports, complete empty ports
    ibnode['nodeports'] = parse_ports(ibnode, guid_desc_mapping, verbose)

    return ibnode
def parse_ports(ibnode, guid_desc_mapping, verbose=False):
    """
    Parse the port lines of an ibnode into a list of ibport dicts.

    ibnode            -- node dict; reads 'nodeports' (text), 'portcount',
                         'nodetype', 'nodeguid', 'nodelid' and 'nodelmc'.
    guid_desc_mapping -- dict mapping integer GUIDs to descriptions, used
                         for the 'todesc' field of linked ports.
    verbose           -- warn about port lines that cannot be parsed.

    Returns the ibports sorted by port number. Ports without a parsed link
    line are added as placeholder entries whose link fields are None.
    An empty 'nodeports' text yields an empty list.
    """
    if not ibnode['nodeports']:
        warning('Empty block')
        return []

    # RE for ibport
    port_pattern = re.compile(r'\[(?P<port>\d+)\](\((?P<portguid>.*)\))?\s+\"(?P<tonodeguid>.*)\"\[(?P<tonodeport>\d+)\](\((?P<toportguid>.*)\))?\s+\#\s+(lid\s+(?P<portlid>\d+)\s+lmc\s+(?P<portlmc>\d+)\s+)?\"(?P<tonodedesc>.*)\"\s+lid\s+(?P<toportlid>\d+)\s+(?P<width>\d+)x(?P<speed>(SDR|DDR|QDR|FDR|EDR|HDR|NDR|XDR))$')

    numeric_fields = ['port', 'portguid', 'tonodeguid', 'tonodeport',
                      'toportguid', 'portlid', 'portlmc', 'toportlid', 'width']

    ibports = []
    for text in ibnode['nodeports'].split('\n'):
        port_match = port_pattern.match(text)
        if not port_match:
            if verbose:
                warning('Cannot parse the following section:\n%s' % text)
            continue

        ibport = port_match.groupdict()
        ibport['tonodedesc'] = ibport['tonodedesc'].strip()

        # generalize data: give every GUID a "0x" prefix
        if ibport['portguid']:
            ibport['portguid'] = '0x' + ibport['portguid']
        if ibport['tonodeguid']:
            # drop the two leading characters of the quoted GUID
            ibport['tonodeguid'] = '0x' + ibport['tonodeguid'][2:]
        if ibport['toportguid']:
            ibport['toportguid'] = '0x' + ibport['toportguid']
        elif ibport['tonodeguid']:
            ibport['toportguid'] = ibport['tonodeguid']

        # port fields missing on the line are inherited from the node
        for suffix in ('guid', 'lid', 'lmc'):
            if not ibport['port' + suffix]:
                ibport['port' + suffix] = ibnode['node' + suffix]

        # digitize whatever is still text
        for field in numeric_fields:
            if isinstance(ibport[field], str):
                ibport[field] = int(ibport[field], 0)

        # ibport['todesc'] is the redefined tonodedesc
        ibport['todesc'] = guid_desc_mapping.get(ibport['tonodeguid'],
                                                 ibport['tonodedesc'])
        ibports.append(ibport)

    # complete ports that do not have a link
    linked = set(ibport['port'] for ibport in ibports)
    is_ca = ibnode['nodetype'] in ['Ca']
    is_switch = ibnode['nodetype'] in ['Switch']
    for number in range(1, ibnode['portcount'] + 1):
        if number in linked:
            continue
        # start with every parsed field set to None, then fill in the few
        # that can be derived from the node itself
        ibport = dict.fromkeys(port_pattern.groupindex.keys())
        ibport['port'] = number
        if is_ca:
            ibport['portguid'] = ibnode['nodeguid'] + number
        else:
            ibport['portguid'] = ibnode['nodeguid']
        if is_switch:
            ibport['portlid'] = ibnode['nodelid']
            ibport['portlmc'] = ibnode['nodelmc']
        ibports.append(ibport)

    # sort all ports based on their port number
    ibports.sort(key=lambda ibport: ibport['port'])

    return ibports
def query_ibnode(ibnodes_index, dest_list, cat_list, re_string, verbose=False):
    """
    Look up node GUIDs in ibnodes_index.

    ibnodes_index -- dict: category -> {value -> list of node GUIDs}.
    dest_list     -- values to look for.
    cat_list      -- categories of ibnodes_index to search in.
    re_string     -- regex template with one '%s' that receives each dest;
                     used for the text categories ('desc', 'nodedesc',
                     'speed', 'type'), matched case-insensitively.
    verbose       -- print a progress message.

    The numeric categories ('nodeguid', 'nodelid', 'portguid', 'portlid',
    'width') are compared for equality instead. Returns the list of all
    matching GUIDs; a GUID may appear more than once.
    """
    if verbose:
        info('Querying %s ...' % dest_list)

    regex_cats = ['desc', 'nodedesc', 'speed', 'type']
    exact_cats = ['nodeguid', 'nodelid', 'portguid', 'portlid', 'width']

    tgt_guid = []
    for dest in dest_list:
        for cat in cat_list:
            by_value = ibnodes_index[cat]
            for key in by_value.keys():
                if key is None:
                    continue
                if cat in regex_cats:
                    matched = re.search(re_string % dest, key, re.I)
                elif cat in exact_cats:
                    matched = dest == key
                else:
                    matched = False
                if matched:
                    tgt_guid.extend(by_value[key])
    return tgt_guid
def display_ibnode(ibnode, detail, verbose=False):
    """
    Format an ibnode structure as text and return it.

    ibnode  -- node dict as built by parse_ibnode().
    detail  -- 0: nothing; >= 1: the node line; >= 2: also the ports with a
               link; >= 3: also the vendor/GUID header and the ports
               without a link.
    verbose -- not used by this function.
    """
    pieces = []

    if detail >= 3:
        pieces.append(
            ' vendid=0x%x\n devid=0x%x\n sysimgguid=0x%016x\n nodeguid=0x%016x\n' %
            (ibnode['vendid'], ibnode['devid'], ibnode['sysimgguid'],
             ibnode['nodeguid']))

    if detail >= 1:
        if ibnode['nodetype'] in ['Switch']:
            pieces.append(
                ' %-6s %2d ports "0x%016x" # "%s" port %s lid %s lmc %s\n' %
                (ibnode['type'], ibnode['portcount'], ibnode['nodeguid'],
                 ibnode['desc'], ibnode['nodeport'], ibnode['nodelid'],
                 ibnode['nodelmc']))
        elif ibnode['type'] in ['Ca']:
            pieces.append(
                ' %-6s %2d ports "0x%016x" # "%s"\n' %
                (ibnode['type'], ibnode['portcount'], ibnode['nodeguid'],
                 ibnode['desc']))

    if detail >= 2:
        for port in ibnode['nodeports']:
            if port['tonodeguid']:
                pieces.append(
                    ' [%2d](0x%016x)(lid %4d) <-> [%2d](0x%016x)(lid %4d) lmc %1d %dx%s # "%s"\n' %
                    (port['port'], port['portguid'], port['portlid'],
                     port['tonodeport'], port['tonodeguid'], port['toportlid'],
                     port['portlmc'], port['width'], port['speed'],
                     port['todesc']))
            elif detail >= 3:
                # ports without a link are only listed at the highest level
                pieces.append(' [%2s] <-> None\n' % (port['port']))

    return ''.join(pieces)
def count_components(ibnodes_index, verbose=False):
    """
    Summarize how many devices of each type the given topology holds.

    ibnodes_index -- index dict; only ibnodes_index['type'] is read.
    verbose       -- not used by this function.

    Returns a string such as '2 Spine 4 Leaf 16 Ca ' that lists the types
    of ibnode_type with a non-zero count, or '0 ' when there are none.
    """
    summary = ''
    for kind in ibnode_type:
        try:
            members = ibnodes_index['type'][kind]
        except KeyError:
            # no 'type' index at all, or no device of this type
            continue
        total = len(members)
        if total:
            summary += '%d %s ' % (total, kind)
    return summary or '0 '
def compare_components(ibnodes, ibnodes_index1, ibnodes_index2, detail, defined_first=True, verbose=False):
    """
    Report the devices that the first topology has and the second lacks.

    ibnodes        -- node dicts keyed by GUID, used to sort and display.
    ibnodes_index1 -- index of the topology whose devices are checked.
    ibnodes_index2 -- index of the topology they are checked against.
    detail         -- not used; devices are always shown at detail level 1.
    defined_first  -- True for a file-to-fabric comparison, False for a
                      fabric-to-file comparison (selects the heading text).
    verbose        -- handed on to display_ibnode().

    Returns the report text, '' when nothing is missing.
    """
    if 'type' not in ibnodes_index1:
        return ''

    if defined_first:
        heading = '%d %s found in the topology file but not on the fabric:\n'
    else:
        heading = '%d %s found on the fabric but not in the topology file:\n'

    report = ''
    for kind in ibnodes_index1['type'].keys():
        try:
            delta = set(ibnodes_index1['type'][kind]) - \
                    set(ibnodes_index2['type'][kind])
        except KeyError:
            # the second topology has no device of this type at all
            delta = ibnodes_index1['type'][kind]
        if not delta:
            continue

        report += heading % (len(delta), kind)

        if kind in ['Spine', 'Leaf', 'Line']:
            try:
                delta = sorted(delta, key=lambda guid: int(ibnodes[guid]['typeindex']))
            except ValueError:
                # a type index that is not a plain number: sort as text
                delta = sorted(delta, key=lambda guid: ibnodes[guid]['typeindex'])
        else:
            delta = sorted(delta, key=lambda guid: ibnodes[guid]['desc'])

        for guid in delta:
            # only output the 1st level of detail
            report += display_ibnode(ibnodes[guid], 1, verbose)
        report += '\n'
    return report
def compare_links(ibnodes, links1, links2, defined_first=True, verbose=False):
    """
    Report the links that the first link graph has and the second lacks.

    ibnodes       -- node dicts keyed by GUID, used to name the link ends.
    links1        -- links to check, each a tuple
                     ((guid, port), (toguid, toport), speed, width).
    links2        -- links to check against, same format.
    defined_first -- True for a file-to-fabric comparison, False for a
                     fabric-to-file comparison (selects the heading text).
    verbose       -- not used by this function.

    A link counts as present in links2 when it is listed there in either
    orientation with the same speed and width. Returns the report text,
    '' when nothing is missing; missing links keep their links1 order.
    """
    def describe(guid):
        # fall back to the GUID in hex when the node (or its desc) is unknown
        try:
            return ibnodes[guid]['desc']
        except (KeyError, TypeError):
            return '0x%x' % guid

    known = set(links2)
    seen = set()
    delta = []
    for link in links1:
        if link in seen:
            continue
        seen.add(link)
        # the same link as recorded from its other end; the full 4-tuple is
        # rebuilt because links2 holds 4-tuples, not (end, end) pairs
        reverse = (link[1], link[0]) + tuple(link[2:])
        if link not in known and reverse not in known:
            delta.append(link)

    if not delta:
        return ''

    if defined_first:
        line = '%d link found in the topology file but not on the fabric:\n' \
               % (len(delta))
    else:
        line = '%d link found on the fabric but not in the topology file:\n' \
               % (len(delta))
    for link in delta:
        line += ' %s [%s] <-> %s [%s]\n' % (describe(link[0][0]), link[0][1],
                                            describe(link[1][0]), link[1][1])
    return line
def plot_topology(ibnodes, tgt_guid, links, graph_file, verbose=False):
    """
    Plot topology with Graphviz, requires pydot.

    ibnodes    -- node dicts keyed by GUID.
    tgt_guid   -- GUIDs of the nodes to plot.
    links      -- ((guid, port), (toguid, toport), speed, width) tuples;
                  only links with both ends in tgt_guid are drawn.
    graph_file -- path of the PNG file to write.
    verbose    -- print a progress message.
    """
    if verbose:
        info("Plotting topology %s ..." % graph_file)

    # edge color per link speed; anything else is drawn in green
    speed_color = {
        'SDR': 'red',
        'DDR': 'orange',
        'QDR': 'yellow',
        'FDR': 'gold',
        'EDR': 'cyan',
        'HDR': 'blue',
    }

    def endpoint_label(guid):
        # "Aggregation Node" has a common description so adding the GUID of
        # that very node to differentiate it
        desc = ibnodes[guid]['desc']
        if desc in ['Mellanox Technologies Aggregation Node']:
            return desc + '\n' + '0x%016x' % ibnodes[guid]['nodeguid']
        return desc

    graph = pydot.Dot(graph_type='graph', rankdir='LR')

    # switches get an explicit, filled box node
    for guid in tgt_guid:
        if ibnodes[guid]['type'] in ['Switch', 'Spine', 'Leaf', 'Line']:
            graph.add_node(pydot.Node(ibnodes[guid]['desc'], shape='box',
                                      style='filled', color='green'))

    selected = set(tgt_guid)
    for link in links:
        if link[0][0] in selected and link[1][0] in selected:
            # each end is labelled from its own node; the second end used
            # to pick up the GUID of the first end by mistake
            endpoint0 = endpoint_label(link[0][0])
            endpoint1 = endpoint_label(link[1][0])
            color = speed_color.get(link[2], 'green')
            width = link[3]
            graph.add_edge(pydot.Edge(endpoint0, endpoint1, color=color,
                                      penwidth=width))

    graph.write(graph_file, prog='dot', format='png')
def ibtracert(ibnode1, ibnode2, hca='', verbose=False):
    """
    Display routing info between two nodes.

    ibnode1 -- source node dict.
    ibnode2 -- destination node dict.
    hca     -- optional HCA name added to the ibtracert command.
    verbose -- not used by this function.

    For a Ca every port GUID is traced, for any other node its node GUID.
    The output of each successful "ibtracert -G" run, or a "No route"
    line for a failed one, is printed to stdout. Returns None.
    """
    def trace_guids(ibnode):
        # GUIDs that ibtracert is run with for this node
        if ibnode['type'] in ['Ca']:
            return [ibport['portguid'] for ibport in ibnode['nodeports']]
        return [ibnode['nodeguid']]

    # run ibtracert to retrieve routing info
    cmd = 'ibtracert'
    if hca:
        cmd = 'ibtracert -C %s' % hca
    cmd = cmd + ' -G %s %s'
    if not WITH_ROOT:
        cmd = 'sudo ' + cmd

    text_line = ''
    for guid1 in trace_guids(ibnode1):
        for guid2 in trace_guids(ibnode2):
            result = run_cmd(cmd % (guid1, guid2))
            if not result['retval']:
                for line in result['retstr'].split('\n'):
                    text_line += ' %s\n' % line
            else:
                text_line += ' No route from %s to %s\n' % (ibnode1['nodedesc'],
                                                            ibnode2['nodedesc'])

    # sys.stdout.write() instead of the Python-2-only print statement; the
    # final newline of the text is dropped and one newline is written, as
    # "print" did.
    sys.stdout.write(text_line[:-1] + '\n')
def sminfo(ibnode, hca='', verbose=False):
    """
    Run sminfo externally to collect SM info.

    ibnode  -- node dict to query.
    hca     -- optional HCA name added to the sminfo command.
    verbose -- print the raw sminfo output to stdout; also handed on to
               parse_sminfo().

    For a Ca, "sminfo -G" is run for the port GUID of every port that has
    a link; for a Switch it is run once for the node GUID. Returns the
    list of parse_sminfo() results of the runs that exited with status 0.
    """
    cmd = 'sminfo'
    if hca:
        cmd = 'sminfo -C %s' % hca
    cmd = cmd + ' -G %s'
    if not WITH_ROOT:
        cmd = 'sudo ' + cmd

    # GUIDs to query, depending on the node type
    if ibnode['nodetype'] in ['Ca']:
        guids = [ibport['portguid'] for ibport in ibnode['nodeports']
                 if ibport['tonodeguid']]
    elif ibnode['nodetype'] in ['Switch']:
        guids = [ibnode['nodeguid']]
    else:
        guids = []

    collected = []
    ret_lines = ''
    for guid in guids:
        result = run_cmd(cmd % guid)
        if not result['retval']:
            ret_lines += result['retstr']
            collected.append(parse_sminfo(result['retstr'], ibnode['desc'], verbose))

    if verbose:
        text_line = ''
        for line in ret_lines.split('\n'):
            text_line += ' %s\n' % line
        # sys.stdout.write() instead of the Python-2-only print statement
        sys.stdout.write(text_line[:-1] + '\n')

    return collected
def sminfo_threading(input_queue, hca, output_queue):
    """
    Worker loop that collects SM info in parallel.

    input_queue  -- queue of (ibnode, verbose) tuples; the string 'STOP'
                    ends the loop.
    hca          -- HCA name handed on to sminfo().
    output_queue -- queue that receives the result of each sminfo() call.
    """
    while True:
        task = input_queue.get()
        if task == 'STOP':
            break
        ibnode, verbose = task
        output_queue.put(sminfo(ibnode, hca, verbose))
def parse_sminfo(lines, desc, verbose=False):
"""
Parse sminfo for formatting.
"""
sminfo = {}
pattern = re.compile(r'^sminfo:\ssm\slid\s(?P<lid>\d+)\ssm\sguid\s(?P<guid>\S+),\sactivity\scount\s(?P<activity>\d+)\spriority\s(?P<priority>\d+)\sstate\s(?P<state>\d+)\s(?P<tag>\S+)$')
for line in lines.split('\n'):
match = pattern.match(line)
if match: