forked from Seagate/cortx-motr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
client.h
1904 lines (1810 loc) · 67.4 KB
/
client.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* -*- C -*- */
/*
* Copyright (c) 2016-2020 Seagate Technology LLC and/or its Affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For any questions about this software or licensing,
* please email opensource@seagate.com or cortx-questions@seagate.com.
*
*/
#pragma once
#ifndef __MOTR_CLIENT_H__
#define __MOTR_CLIENT_H__
#include "lib/vec.h"
#include "lib/types.h"
#include "sm/sm.h" /* struct m0_sm */
#include "rpc/rpc_machine.h" /* M0_RPC_DEF_MAX_RPC_MSG_SIZE */
#include "fid/fid.h"
#include "lib/cookie.h"
#include "xcode/xcode_attr.h"
/**
* @defgroup client
*
* Overview
* --------
*
* Examples of Motr applications are:
*
* - Motr file system client (m0t1fs);
*
* - Lustre osd-motr module (part of LOMO);
*
* - Lustre HSM backend (part of Castor-A200);
*
* - SWIFT or S3 backend (part of WOMO);
*
* - Motr-based block device (part of BOMO);
*
* Client interface is divided into the following sub-interfaces:
*
* - access sub-interface, which provides basic abstraction to build storage
* application;
*
* - management sub-interface, which provides methods for Motr cluster
* deployment, configuration, re-configuration and monitoring;
*
* - extension sub-interface, which allows applications to extend Motr
* functionality without modifying the core Motr.
*
* This header describes the access sub-interface of client, simply called
* "client interface" hereafter.
*
* In the following "an application" means a code invoking the client interface
* and "the implementation" refers to the implementation of the said interface.
*
* Client provides the following abstractions:
*
* - object (m0_obj) is an array of fixed-size blocks;
*
* - index (m0_idx) is a key-value store;
*
* - realm (m0_realm) is a spatial and temporal part of system with a
* prescribed access discipline. Objects, indices and operations live in
* realms;
*
* - operation (m0_op) is a process of querying or updating system
* state;
*
* Realms are further sub-divided in:
*
* - transaction (m0__dtx) is a collection of operations atomic in the
* face of failures;
*
* - epoch (m0_epoch) is a collection of operations done by an
* application, which moves the system from one application-consistent
* state to another;
*
* - container (m0_container) is a collection of objects used by a
* particular application or group of applications;
*
* - other types of realms, as can be added in the future.
*
* Object, index and realm are sub-types of entity (m0_entity). Entities
* provide state, interface and behavior shared by all sub-types.
*
* All client entry points, except for m0_op_wait(), are non-blocking.
* To perform a potentially lengthy activity, that might involve network
* communication (for example, read from an object), the client entry point
* (m0_obj_op() in the case of object read), sets up an operation
* (m0_op) structure containing the parameters of the activity and
* immediately returns to the caller. The caller should explicitly launch a set
* of previously prepared operations by calling m0_op_launch().
* Separation of preparation and launch provides for more efficient network
* communication, where multiple operations are accumulated in the same network
* message.
*
* In-memory structures (m0_{obj,index,realm,...}) correspond to some
* storage entities maintained by the implementation. The implementation does
* not enforce this correspondence. If an application keeps multiple in-memory
* structures for the same storage entity (whether in the same process address
* space or not), it is up to the application to keep the structures coherent
* (the very notion of coherency, obviously, being application-dependent). At
* the one end of the spectrum, an application can employ a fully coherent,
* distributed cache for entities, providing strong consistency guarantees. On
* the other end, an application can tolerate multiple inconsistent views of the
* same storage entities, providing NFSv2-like semantics.
*
* Sub-typing
* ----------
*
* @verbatim
*
* entity (create, delete, open, close, fini) [abstract, no constructor]
* |
* |
* +---- object (init, read, write, alloc, free)
* |
* |
* +---- index (init, get, put, next)
* |
* |
* +---- realm () [abstract, no constructor]
* |
* |
* +---- container (init)
* |
* |
* +---- epoch (init)
* |
* |
* +---- dtx (init)
*
*
* op (init, wait, setup, launch, kick, free, fini)
* [has private sub-types in private.h]
*
* @endverbatim
*
* Identifiers
* -----------
*
* An entity exists in some realm and has a 128-bit identifier, unique within
* the cluster and never re-used. The high 8 bits of an identifier denote the
* entity type. Identifier management is up to the application, except that the
* single identifier M0_UBER_REALM is reserved for the "uber realm",
* representing the root of the realm hierarchy, and within each entity type,
* identifiers less than M0_ID_APP are reserved for the implementation's
* internal use.
*
* @todo A library on top of client for fast scalable identifier allocation will
* be provided as part of Motr.
*
* The implementation is free to reserve some 8-bit combinations for its
* internal use.
*
* @todo an interface to register 8-bit combinations for application use (to
* introduce application-specific "entity-like" things).
*
* Operations
* ----------
*
* An operation structure tracks the state of execution of a request made to the
* implementation.
*
* An operation structure is a state machine going through states described by
* enum m0_op_state:
*
* @verbatim
* (0)
* |
* |
* V
* +---------------INITIALISED
* | |
* | | m0_op_launch()
* V V
* FAILED<-------------LAUNCHED
* ^ |
* | |
* | V
* +----------------EXECUTED---------------->STABLE
* @endverbatim
*
* An operation in INITIALISED, FAILED or STABLE state is called "complete" and
* "outstanding" (or "in-progress") otherwise.
*
* An operation is in INITIALISED state after allocation. In this state, the
* operation processing is not yet started, the application is free to modify
* operation parameters with a call to m0_op_setup() or direct field
* access.
*
* Multiple initialised operation structures can be simultaneously moved to the
* LAUNCHED state, by a call to m0_op_launch(). This call starts actual
* operation processing. No changes to the operation structure are allowed by
* the application after this call is made and until the operation completes.
* To improve caching and utilisation of system resources, the implementation
* is free to delay any operation-related activities, such as sending network
* messages, for some time after the operation is launched. The value of
* m0_op::op_linger is an application-provided hint about the absolute
* time by which such delays should be limited.
*
* In case of successful execution, a launched operation structure eventually
* reaches EXECUTED state, meaning that the operation was executed at least in
* the volatile stores of the respective services. When the operation enters
* EXECUTED state, the m0_op_ops::oop_executed() call-back, if provided
* by the application, is invoked. By the time this state is entered, the
* operation return code is in m0_op::op_sm::sm_rc, and all return
* information (object data in case of READ, keys and values in case of GET and
* NEXT) are already placed in the application-supplied buffers.
*
* After an operation has been executed, it can still be lost due to a
* failure. The implementation continues to work toward making the operation
* stable. When this activity successfully terminates, the operation enters the
* STABLE state and the m0_op_ops::oop_stable() call-back is invoked, if
* provided. Once an operation is stable, the implementation guarantees that the
* operation would survive any "allowed failure", where allowed failures include
* at least transient service failures (crash and restart with volatile store
* loss), transient network failures and client failures.
*
* In case of a failure, the operation structure moves into FAILED state, the
* m0_op_ops::oop_failed() call-back is invoked, and no further state
* transitions will ensue.
*
* The implementation is free to add other states to the operation state
* machine.
*
* All operation structures embed "generic operation" m0_op as the first
* member.
*
* The application can track the state of the operation either synchronously, by
* waiting until the operation reaches a particular state (m0_op_wait()),
* or asynchronously by supplying (m0_op_setup()) a call-back to be
* called when the operation reaches a particular state.
*
* Operation structures are either pre-allocated by the application or allocated
* by the appropriate entry points, see the "op" parameter of m0_obj_op()
* for an example. When an operation structure is pre-allocated, the application
* must set m0_op::op_size to the size of the pre-allocated structure
* before passing it to a client entry point. This allows the implementation to
* check that the pre-allocated structure has enough room and return an error
* (-EMSGSIZE) otherwise.
*
* Operation errors are returned through m0_op::op_sm::sm_rc.
*
* Operations, common for all entity types are implemented at the entity level:
* m0_entity_create(), m0_entity_delete(),
* m0_entity_fini().
*
* A typical usage would involve initialisation of a concrete entity (e.g.,
* object), execution of some generic operations and then of some concrete
* operations, for example:
*
* @code
* m0_obj o;
* m0_op *ops[2] = {};
*
* // initialise object in-memory structure.
* m0_obj_init(&o, &container, &id, 0);
*
* // initiate object creation. m0_entity_create() allocates the
* // operation and stores the pointer to it in ops[0].
* m0_entity_create(NULL, &o.ob_entity, &ops[0]);
*
* // initiate write data in the object.
* result = m0_obj_op(&o, M0_OC_WRITE, ..., &ops[1]);
*
* // launch both operations (creation and write)
* m0_op_launch(ops, ARRAY_SIZE(ops));
*
* // wait until creation completes
* result = m0_op_wait(ops[0], M0_BITS(M0_OS_STABLE,
* M0_OS_FAILED),
* M0_TIME_NEVER);
* // wait until write completes
* result = m0_op_wait(ops[1], M0_BITS(M0_OS_STABLE,
* M0_OS_FAILED),
* M0_TIME_NEVER);
* // finalise the object
* m0_entity_fini(&o.ob_entity);
*
* // free the operations
* m0_op_free(ops[0]);
* m0_op_free(ops[1]);
* @endcode
*
* Object
* ------
*
* A client object is an array of blocks, which can be read from and written
* onto at the block granularity.
*
* Block size is a power of two bytes and is selected at the object creation
* time.
*
* An object has no traditional application-visible meta-data (in particular, it
* has no size). Instead it has meta-data, called "block attributes" associated
* with each block. Block attributes can be used to store check-sums, version
* numbers, hashes, etc. Because of the huge number of blocks in a typical
* system, the overhead of block attributes book-keeping must be kept at a
* minimum, which puts restrictions on the block attribute access interface
* (@todo to be described).
*
* There are 4 types of object operations, in addition to the common entity
* operations:
*
* - READ: transfer blocks and block attributes from an object to
* application buffers;
*
* - WRITE: transfer blocks and block attributes from application buffers to
* an object;
*
* - ALLOC: pre-allocate certain blocks in an implementation-dependent
* manner. This operation guarantees that consecutive WRITE onto
* pre-allocated blocks will not fail due to lack of storage space;
*
* - FREE: free storage resources used by specified object
* blocks. Consecutive reads from the blocks will return zeroes.
*
* READ and WRITE operations are fully scatter-gather-scatter: data are
* transferred between a sequence of object extents and a sequence of
* application buffers, the only restrictions being:
*
* - total lengths of the extents must be equal to the total size of the
* buffers, and
*
* - extents must be block-size aligned and sizes of extents and buffers
* must be multiples of block-size.
*
* Internally, the implementation stores an object according to the object
* layout (specified at the object creation time). The layout determines
* fault-tolerance and performance related characteristics of the
* object. Example layouts are:
*
* - network striping with parity de-clustering. This is the default layout,
* it provides a flexible level of fault-tolerance, high availability in
* the face of permanent storage device failures and full utilisation of
* storage resources;
*
* - network striping without parity (raid0). This provides higher space
* utilisation and lower processor costs than parity de-clustering at the
* expense of fault-tolerance;
*
* - network mirroring (raid1). This provides high fault-tolerance and
* availability at the cost of storage space consumption;
*
* - de-dup, compression, encryption.
*
* Index
* -----
*
* A client index is a key-value store.
*
* An index stores records, each record consisting of a key and a value. Keys
* and values within the same index can be of variable size. Keys are ordered by
* the lexicographic ordering of their bit-level representation. Records are
* ordered by the key ordering. Keys are unique within an index.
*
* There are 4 types of index operations:
*
* - GET: given a set of keys, return the matching records from the index;
*
* - PUT: given a set of records, place them in the index, overwriting
* existing records if necessary, inserting new records otherwise;
*
* - DEL: given a set of keys, delete the matching records from the index;
*
* - NEXT: given a set of keys, return the records with the next (in the
* ascending key order) keys from the index.
*
* Indices are stored according to a layout, much like objects.
*
* Realm
* -----
*
* To define what a realm is, consider the entire history of a client storage
* system. In the history, each object and index is represented as a "world
* line" (https://en.wikipedia.org/wiki/World_line), which starts when the
* entity is created and ends when it is deleted. Points on the world line
* correspond to operations that update entity state.
*
* A realm is the union of some continuous portions of different world
* lines. That is, a realm is given by a collection of entities and, for each
* entity in the collection, a start and an end point (operations) on the
* entity world line. A realm can be visualised as a cylinder in the history.
*
* The restriction placed on realms is that each start point in a realm must
* either be the first point in a world line (i.e., the corresponding entity is
* created in the realm) or belong to the same realm, called the parent of the
* realm in question. This arranges realms in a tree.
*
* @note Realms are *not* necessarily disjoint.
*
* A realm can be in the following application-controllable states:
*
* - OPEN: in this state the realm can be extended by executing new
* operations against entities already in the realm or creating new
* entities in the realm;
*
* - CLOSED: in this state the realm can no longer be extended, but it is
* tracked by the system and maintains its identity. Entities in a closed
* realm can be located and read-only operations can be executed on them;
*
* - ABSORBED: in this state the realm is no longer tracked by the
* system. All the operations executed as part of the realm are by now
* stable and absorbed in the parent realm;
*
* - FAILED: an application aborted the realm or the implementation
* unilaterally decided to abort it. The operations executed in the realm
* are undone together with a transitive closure of dependent operations
* (the precise definition being left to the implementation
* discretion). Thus, failure of a realm can lead to cascading failures of
* other realms.
*
* Examples of realms are:
*
* - a container (m0_container) can be thought of as a "place" where
* a particular storage application lives. In a typical scenario, when an
* application is setup on a system, a new container, initially empty,
* will be created for the application. The application can create new
* entities in the container and manipulate them without risk of conflicts
* (e.g., for identifier allocation) with other applications. A container
* can be thought of as a virtualised storage system for an application. A
* container realm is open as long as application needs its persistent
* data. When the application is uninstalled, its realm is deleted;
*
* - a snapshot realm is created with a container as the parent and is
* immediately closed. From now on, the snapshot provides a read-only view
* of container objects at the moment of the snapshot creation. Finally,
* the snapshot is deleted. If a snapshot is not closed immediately, but
* remains open, it is a writeable snapshot (clone)---a separate branch in
* the container's history. A clone is eventually deleted without being
* absorbed in the parent container;
*
* - an epoch (m0_epoch) is a realm capturing part of the
* application's work-flow for resilience. Often an HPC application works
* by interspersing "compute phases", when actual data processing is done,
* with an "IO phase" when a checkpoint of application state is saved on
* the storage system for failure recovery purposes. A client application
* would, instead, keep an open "current" epoch realm, closed at the
* compute-IO phase transition, with the new epoch opened immediately. The
* realm tree for such application would look like
*
* @verbatim
*
* CONTAINER--->E--->E---...->E--->CURRENT
*
* @endverbatim
*
* Where all E epochs are closed and in the process of absorption, and all
* earlier epochs already absorbed in the container.
*
* If the application fails, it can restart either from the container or
* from any closed epoch, which are all guaranteed to be consistent, that
* is, reflect storage state at the boundary of a compute phase. The final
* CURRENT epoch is potentially inconsistent after a failure and should be
* deleted.
*
* - a distributed transaction (m0__dtx) is a group of operations,
* which must be atomic w.r.t. failures.
*
* Ownership
* ---------
*
* Client entity structures (realms, objects and indices) are allocated by the
* application. The application may free a structure after completing the
* corresponding finalisation call. The application must ensure that all
* outstanding operations on the entity are complete before finalisation.
*
* An operation structure allocated by the application, must remain allocated
* until the operation is complete. Before a complete operation structure can be
* re-used, it should be finalised by a call to m0_op_fini(). An
* operation structure allocated by the client implementation can be finalised
* and re-used as one allocated by the application, and must be eventually freed
* by the application (by calling m0_op_free()) some time after the
* operation completes.
*
* Data blocks used by scatter-gather-scatter lists and key-value records are
* allocated by the application. For read-only operations (M0_OC_READ,
* M0_IC_GET and M0_IC_NEXT) the application may free the data
* blocks as soon as the operation reaches EXECUTED or FAILED state. For
* updating operations, the data blocks must remain allocated until the
* operation stabilises.
*
* Concurrency
* -----------
*
* The client implementation guarantees that concurrent calls to the same index
* are linearizable.
*
* All other concurrency control, including ordering of reads and writes to a
* client object, and distributed transaction serializability, is up to the
* application.
*
* For documentation links, please refer to this file :
* doc/motr-design-doc-list.rst
*
* @todo entity type structures (to provide constructors, 8-bit identifier tags
* and an ability to register new entity types).
*
* @todo handling of extensible attributes (check-sums, version numbers, etc.),
* which require interaction with the implementation on the service side.
*
* @{
*/
/**
 * Operation codes common to entities (shared by objects and indices).
 *
 * Every enumerator carries its numeric value explicitly. M0_EO_NR is one
 * past the last entity operation code.
 */
enum m0_entity_opcode {
	M0_EO_INVALID    = 0,
	M0_EO_CREATE     = 1,
	M0_EO_DELETE     = 2,
	M0_EO_SYNC       = 3,
	M0_EO_OPEN       = 4,
	M0_EO_GETATTR    = 5,
	M0_EO_SETATTR    = 6,
	M0_EO_LAYOUT_GET = 7,
	M0_EO_LAYOUT_SET = 8,
	M0_EO_NR         = 9
} M0_XCA_ENUM;
/**
 * Object operation codes.
 *
 * The first code is M0_EO_NR + 1; each following code is one greater than
 * its predecessor.
 */
enum m0_obj_opcode {
	/** Read object data. */
	M0_OC_READ  = M0_EO_NR + 1,    /* 10 */
	/** Write object data. */
	M0_OC_WRITE = M0_OC_READ + 1,  /* 11 */
	/** Pre-allocate space. */
	M0_OC_ALLOC = M0_OC_WRITE + 1, /* 12 */
	/** De-allocate space, consecutive reads will return 0s. */
	M0_OC_FREE  = M0_OC_ALLOC + 1, /* 13 */
	/** One past the last object operation code. */
	M0_OC_NR    = M0_OC_FREE + 1   /* 14 */
} M0_XCA_ENUM;
/**
 * Index operation codes.
 *
 * The first code is M0_OC_NR + 1; each following code is one greater than
 * its predecessor.
 */
enum m0_idx_opcode {
	/** Lookup a value with the given key. */
	M0_IC_GET    = M0_OC_NR + 1,     /* 15 */
	/** Insert or update the value, given a key. */
	M0_IC_PUT    = M0_IC_GET + 1,    /* 16 */
	/** Delete the record, if any, for the given key. */
	M0_IC_DEL    = M0_IC_PUT + 1,    /* 17 */
	/** Given a key, return the next keys and their values. */
	M0_IC_NEXT   = M0_IC_DEL + 1,    /* 18 */
	/** Check the given index for existence. */
	M0_IC_LOOKUP = M0_IC_NEXT + 1,   /* 19 */
	/**
	 * Given an index id, get the list of next indices.
	 *
	 * @note the index ids will be fetched into the keys array
	 * argument of m0_idx_op().
	 */
	M0_IC_LIST   = M0_IC_LOOKUP + 1, /* 20 */
	/** One past the last index operation code. */
	M0_IC_NR     = M0_IC_LIST + 1    /* 21 */
} M0_XCA_ENUM;
/**
 * Bit flags accepted by m0_obj_op() that tune the behaviour of an object
 * IO operation. Each flag occupies its own bit, so flags can be OR-ed.
 */
enum m0_op_obj_flags {
	/**
	 * When a read meets a hole, return zeros instead of an error.
	 * WARNING: if the hole was caused by an error during write, the
	 * data returned may be corrupted, so it is better to verify it.
	 */
	M0_OOF_HOLE = 0x1, /* bit 0 */
	/**
	 * Write, alloc and free operations wait for the transaction to
	 * become persistent before returning.
	 */
	M0_OOF_SYNC = 0x2, /* bit 1 */
	/**
	 * Last unit(s) of the object. The user must pass this flag when
	 * reading or writing the last unit(s) of the object, to indicate
	 * where the object ends. Without it, in degraded read mode libmotr
	 * may try to read all the missing units of the last parity group,
	 * including those beyond the object end, and may end up with too
	 * many errors to be able to recover the data. On writing, it may
	 * trigger a needless RMW and produce wrong parity data after reading
	 * the stale data units of the existing old object.
	 *
	 * Note: the flag is needed because Motr does not know the size of
	 * the objects it stores. (The user does know it, for example, by
	 * storing objects' metadata in the KV-store.)
	 */
	M0_OOF_LAST = 0x4, /* bit 2 */
	/**
	 * Indicates that the extents provided at indexvec for the I/O
	 * operation fully span the parity groups, and that RMW
	 * (read-modify-write) should not happen. Motr will check it, and
	 * return -EINVAL if it is not so.
	 */
	M0_OOF_FULL = 0x8, /* bit 3 */
} M0_XCA_ENUM;
/**
 * Kinds of entity the client supports: realms, objects and indices.
 */
enum m0_entity_type {
	M0_ET_REALM = 0,
	M0_ET_OBJ   = 1,
	M0_ET_IDX   = 2
} M0_XCA_ENUM;
/**
 * Bit flags, stored in m0_entity::en_flags before calling
 * m0_entity_create() or m0_entity_open(), that describe the application's
 * behaviour.
 */
enum m0_entity_flags {
	/**
	 * A motr client application that is able to store object metadata
	 * by itself (such as pool version and layout, which the application
	 * can keep in a motr distributed index, for example) can set this
	 * flag to avoid sending additional metadata fops on such object
	 * operations as CREATE, OPEN, DELETE, GETATTR and, thus, improve
	 * its performance.
	 *
	 * The application is expected to set
	 * obj->ob_entity.en_flags |= M0_ENF_META before calling
	 * m0_entity_create() or m0_entity_open(). When m0_entity_create()
	 * returns, the pool version and layout id are available to the
	 * application at obj->ob_attr.oa_pver and
	 * obj->ob_attr.oa_layout_id respectively.
	 *
	 * For example, a create workflow can look like this:
	 *
	 *   obj->ob_entity.en_flags |= M0_ENF_META;
	 *   m0_entity_create(NULL, &obj->ob_entity, &ops[0]);
	 *   // Save the returned pool version and lid into app_meta_data
	 *   app_meta_data.pver = obj->ob_attr.oa_pver;
	 *   app_meta_data.lid  = obj->ob_attr.oa_layout_id;
	 *
	 * And a read workflow:
	 *
	 *   obj->ob_entity.en_flags |= M0_ENF_META;
	 *   // Set the pool version and lid from app_meta_data
	 *   obj->ob_attr.oa_pver      = app_meta_data.pver;
	 *   obj->ob_attr.oa_layout_id = app_meta_data.lid;
	 *   m0_entity_open(NULL, &obj->ob_entity, &ops[0]);
	 */
	M0_ENF_META   = 0x1, /* bit 0 */
	/**
	 * When set during entity create, means that the application does
	 * not support the update operation.
	 * XXX: This flag is not in use and will be deleted soon, so please
	 * remove it from your code. If you need to avoid RMW when writing
	 * the non-full last parity group, use the M0_OOF_LAST flag instead.
	 */
	M0_ENF_NO_RMW = 0x2, /* bit 1 */
	/*
	 * The two flags below are for Data Integrity (DI):
	 * M0_ENF_DI     - set it if the application passes the checksum in
	 *                 ioo_attr;
	 * M0_ENF_GEN_DI - set it if the application wants Motr to generate
	 *                 the checksum. The default checksum is generated
	 *                 using the M0_CKSUM_DEFAULT_PI algorithm.
	 * Note: ideally only one DI flag should be set; if both are set,
	 * Motr gives priority to DI generation (M0_ENF_GEN_DI).
	 */
	/** The application passes the checksum for the IO. */
	M0_ENF_DI     = 0x4, /* bit 2 */
	/** Motr generates the DI for the IO. */
	M0_ENF_GEN_DI = 0x8  /* bit 3 */
} M0_XCA_ENUM;
/**
* Generic client operation structure.
*
* Concrete operation structures embed this structure as their first member.
* The state of the operation (see enum m0_op_state) is kept in
* m0_op::op_sm::sm_state.
*/
struct m0_op {
/** Magic number; presumably used to sanity-check the structure (TODO confirm). */
uint64_t op_magic;
/**
* Operation code.
*
* @see m0_entity_opcode, m0_realm_opcode
* @see m0_obj_opcode, m0_idx_opcode
*/
unsigned int op_code;
/** Operation result code */
int32_t op_rc;
/** Each op has its own sm group. */
struct m0_sm_group op_sm_group;
/** Operation state machine. */
struct m0_sm op_sm;
/** Application-supplied call-backs. */
const struct m0_op_ops *op_cbs;
/** The entity this operation is on. */
struct m0_entity *op_entity;
/**
* Caching dead-line: an application-provided hint about the absolute time
* by which the implementation should limit delaying operation-related
* activities after launch.
*/
m0_time_t op_linger; /* a town in Luxembourg. */
/**
* Size of the ambient operation structure. For a pre-allocated operation
* the application sets this before passing it to a client entry point.
*/
size_t op_size;
/** Part of a cookie (m0_cookie) used to identify this operation. */
uint64_t op_gen;
/**
* Back pointer to parent op and it is used to form an execution plan
* for a group of ops. An example: an composite layout IO op is divided
* into a few IO ops to sub-objects. Each sub-object IO op has an
* pointer to the composite IO op.
*/
struct m0_op *op_parent;
/** AST related to the parent op. NOTE(review): exact use is not visible here. */
struct m0_sm_ast op_parent_ast;
/** list of pending transactions. */
struct m0_tl op_pending_tx;
/** Presumably protects op_pending_tx (TODO confirm). */
struct m0_mutex op_pending_tx_lock;
/** Operation's private data, can be used as arguments for callbacks. */
void *op_datum;
/** NOTE(review): meaning is not documented in this header. */
uint64_t op_count;
/**
* This flag is set when there is an ongoing cancel operation.
* There is no refcount in this op. But the op cancelling AST
* needs this op being valid. The op cancelling AST will
* semaphore up when it is done. The m0_op_fini() checks this flag
* and semaphore down on it if needed. This will make sure the op
* is not freed before the op cancel is done.
*/
bool op_cancelling;
/** Presumably the semaphore used by the cancel hand-shake above (TODO confirm). */
struct m0_semaphore op_sema;
/**
* Private field, to be used by internal implementation.
*/
void *op_priv;
/** Presumably protects op_priv (TODO confirm). */
struct m0_mutex op_priv_lock;
};
/**
 * States of an operation; the current one is kept in
 * m0_op::op_sm::sm_state. M0_OS_NR is one past the last state.
 */
enum m0_op_state {
	M0_OS_UNINITIALISED = 0,
	M0_OS_INITIALISED   = 1,
	M0_OS_LAUNCHED      = 2,
	M0_OS_EXECUTED      = 3,
	M0_OS_STABLE        = 4,
	M0_OS_FAILED        = 5,
	M0_OS_NR            = 6
};
/**
* Common structure shared by objects, indices and realms.
*
* It is embedded in the concrete entity structures (for example as
* m0_obj::ob_entity).
*/
struct m0_entity {
/** Entity type. */
enum m0_entity_type en_type;
/** Globally unique, not re-usable entity identifier. */
struct m0_uint128 en_id;
/** Parent realm, this entity lives in. */
struct m0_realm *en_realm;
/**
* Entity state machine. Used internally by the implementation. For the
* reference, the state diagram is:
*
* @verbatim
* create
* CREATING<--------+
* | |
* | |
* | |
* | |
* +---------->INIT<----------------------CLOSING
* | | | ^
* | | | |
* | | | | close
* | | | |
* DELETING<--------+ +-------->OPENING-------->OPEN
* delete open
* @endverbatim
*
*/
struct m0_sm en_sm;
/** Each entity has its own sm group. */
struct m0_sm_group en_sm_group;
/** list of pending transactions. */
struct m0_tl en_pending_tx;
/** Presumably protects en_pending_tx (TODO confirm). */
struct m0_mutex en_pending_tx_lock;
/** Bitmask of enum m0_entity_flags values (e.g. M0_ENF_META), set by the application. */
uint32_t en_flags;
};
/**
* Object attributes.
*
* These are supplied by an application when an object is created and returned
* by the implementation when an object is opened.
*/
struct m0_obj_attr {
/** Binary logarithm (bit-shift) of the object's minimal block size. */
m0_bcount_t oa_bshift;
/** Layout ID for an object. */
uint64_t oa_layout_id;
/**
* The pool this object stores data to. A pool can be selected when
* creating an object by specifying this field. A pool version matching
* the specified pool fid is then chosen for the object. The pool
* version is then stored as one of its attributes in service's backend.
*/
struct m0_fid oa_pool;
/** Pool version fid. */
struct m0_fid oa_pver;
/**
* Buffer size for object IO. Set this before m0_obj_init() to generate
* optimal layout id during m0_entity_create().
*/
size_t oa_buf_size;
};
/**
 * Types of client layout.
 *
 * A layout is an entity containing the information needed to locate data
 * (node, service, device). TODO: rewrite the definition.
 */
enum m0_client_layout_type {
	M0_LT_PDCLUST   = 0,
	M0_LT_COMPOSITE = 1,
	M0_LT_CAPTURE   = 2,
	/** Number of layout types listed above; keep this entry last. */
	M0_LT_NR
};
/**
* Forward declaration: m0_obj holds a pointer to its layout, while the layout
* structure holds a back pointer to the object.
*/
struct m0_client_layout;
/**
* Object is an array of blocks. Each block has a 64-bit index and block
* attributes.
*/
struct m0_obj {
/** Entity part of the object. */
struct m0_entity ob_entity;
/** Object attributes, see struct m0_obj_attr. */
struct m0_obj_attr ob_attr;
/** Layout of this object. */
struct m0_client_layout *ob_layout;
/** Cookie associated with a RM context */
struct m0_cookie ob_cookie;
};
/**
* Layout of an object. A layout is itself an entity and carries its type and
* a back pointer to the object it belongs to.
*/
struct m0_client_layout {
/** Entity part of the layout. */
struct m0_entity ml_entity;
/** Layout type, see enum m0_client_layout_type. */
enum m0_client_layout_type ml_type;
/** Back pointer to the object it belongs to. */
struct m0_obj *ml_obj;
/**
* Layout operations. NOTE(review): struct m0_client_layout_ops is not
* defined in this part of the file -- see its definition for the
* available operations.
*/
const struct m0_client_layout_ops *ml_ops;
};
/**
* Index attributes.
*
* These are supplied by an application and returned by the implementation
* when an index is created.
*/
struct m0_idx_attr {
/** DIX pool layout type. Please refer to enum dix_layout_type. */
uint32_t idx_layout_type;
/** DIX pool version. */
struct m0_fid idx_pver;
};
/**
* Index is an ordered key-value store.
*
* A record is a key-value pair. A new record can be inserted in an index; a
* record with a given key can be looked for, updated or deleted.
*
* An index can be iterated starting from a given key. Keys are ordered in the
* lexicographical order of their bit-representations.
*
* The format of the entity identifier in_entity::en_id depends on the index
* service type. M0_IDX_MOCK and M0_IDX_CASS services accept arbitrary
* identifiers. M0_IDX_DIX service supports two types of indices:
* - distributed index, which is identified by a well-formed motr fid having
* m0_dix_fid_type type.
* - non-distributed index, which is identified by a well-formed motr fid having
* m0_cas_index_fid_type type.
*/
struct m0_idx {
/** Entity part of the index. */
struct m0_entity in_entity;
/** Index attributes, see struct m0_idx_attr. */
struct m0_idx_attr in_attr;
};
/**
* All-ones 64-bit value. NOTE(review): judging by the name, presumably used
* as the "infinite" offset/length of a composite layer extent -- confirm at
* the use sites.
*/
#define M0_COMPOSITE_EXTENT_INF (0xffffffffffffffff)
/**
* Key made of a layer identifier and an offset. NOTE(review): judging by the
* name, presumably the key of a record in a composite layer index -- confirm.
*/
struct m0_composite_layer_idx_key {
/** Identifier of the layer. */
struct m0_uint128 cek_layer_id;
/**
* Offset. NOTE(review): presumably the start offset of an extent within
* the layer -- confirm against the code using this key.
*/
m0_bindex_t cek_off;
};
/**
* Value holding a single length. NOTE(review): judging by the name, presumably
* the value paired with struct m0_composite_layer_idx_key -- confirm.
*/
struct m0_composite_layer_idx_val {
/**
* Length. NOTE(review): presumably the length of the extent that starts
* at the key's cek_off -- confirm against the code using this value.
*/
m0_bcount_t cev_len;
};
/**
 * Kinds of realm, stored in m0_realm::re_type.
 *
 * @see m0_container, m0_epoch, m0__dtx.
 */
enum m0_realm_type {
	M0_ST_CONTAINER = 0,
	M0_ST_EPOCH     = 1,
	M0_ST_DTX       = 2,
	/** Number of realm types listed above; keep this entry last. */
	M0_ST_NR
};
/**
* Forward declaration: m0_client represents a client instance, a connection
* to a motr cluster.
*
* Defined in motr/client_internal.h
*/
struct m0_client;
/**
* Realm is where entities (including other realms) live.
*
* @see m0_container, m0_epoch, m0__dtx.
*/
struct m0_realm {
/** Entity part of the realm. */
struct m0_entity re_entity;
/** Kind of realm, see enum m0_realm_type. */
enum m0_realm_type re_type;
/** Pointer to a client instance (struct m0_client). */
struct m0_client *re_instance;
};
/**
* Container is a special type of realm, used to partition the storage system
* among applications.
*/
struct m0_container {
/** Realm part of the container. */
struct m0_realm co_realm;
};
/**
* Epoch is a special type of realm, used by an application (or a
* collaborative set of applications) to partition their work in consistent
* portions.
*
* An epoch boundary should be a consistent (from the application's point of
* view) state of storage. By resuming from a given epoch, applications can
* implement a scalable failure recovery mechanism.
*/
struct m0_epoch {
/** Realm part of the epoch. */
struct m0_realm ep_realm;
};
/**
* Distributed transaction is a special type of realm, which is a group of
* operations that is atomic w.r.t. certain failures.
*/
struct m0__dtx {
/** Realm part of the distributed transaction. */
struct m0_realm dt_realm;
};
/**
 * Operation callbacks.
 *
 * Every member points to a function that takes the operation and returns
 * nothing. NOTE(review): judging by the names, the callbacks presumably
 * correspond to the M0_OS_EXECUTED, M0_OS_FAILED and M0_OS_STABLE operation
 * states -- confirm when each one is invoked.
 */
struct m0_op_ops {
	void (*oop_executed)(struct m0_op *op);
	void (*oop_failed)(struct m0_op *op);
	void (*oop_stable)(struct m0_op *op);
};
/**
* m0_config contains configuration parameters to set up a
* client instance.
*/
struct m0_config {
/** oostore mode is set when 'is_oostore' is TRUE. */
bool mc_is_oostore;
/**
* Flag for verify-on-read. Parity is checked when doing
* READ's if this flag is set.
*/
bool mc_is_read_verify;
/**
* Flag to enable/disable addb2 initialization
*/
bool mc_is_addb_init;
/** Local endpoint.*/
const char *mc_local_addr;
/** HA service's endpoint.*/
const char *mc_ha_addr;
/** Process fid for rmservice@client. */
const char *mc_process_fid;
const char *mc_profile;
/**
* The minimum length of the 'tm' receive queue,
* use M0_NET_TM_RECV_QUEUE_DEF_LEN if unsure.
*/
uint32_t mc_tm_recv_queue_min_len;