diff --git a/.ansible-lint b/.ansible-lint
new file mode 100644
index 000000000..973ff29f3
--- /dev/null
+++ b/.ansible-lint
@@ -0,0 +1,5 @@
+ - 'fqcn-builtins'
+ - 'fqcn'
+ - 'name[missing]'
+ - 'name[template]'
diff --git a/.gitignore b/.gitignore
index 53354f0e6..bc44901af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,173 @@
+# complete idea
+# variable resources
+# any log files
+# Byte-compiled / optimized / DLL files
+# C extensions
+# Distribution / packaging
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+# Installer logs
+# Unit test / coverage reports
+# Translations
+# Django stuff:
+# Flask stuff:
+# Scrapy stuff:
+# Sphinx documentation
+# PyBuilder
+# Jupyter Notebook
+# IPython
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+# Celery stuff
+# SageMath parsed files
+# Environments
+# Spyder project settings
+# Rope project settings
+# mkdocs documentation
+# mypy
+# Pyre type checker
+# pytype static type analyzer
+# Cython debug symbols
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# CMake
+# Mongo Explorer plugin
+# File-based project format
+# IntelliJ
+# mpeltonen/sbt-idea plugin
+# JIRA plugin
+# Crashlytics plugin (for Android Studio and IntelliJ)
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 000000000..d3d3a2306
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,619 @@
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+# Load and enable all available extensions. Use --list-extensions to see a list
+# of all available extensions.
+# In error mode, messages with a category besides ERROR or FATAL are
+# suppressed, and no reports are done by default. Error mode is compatible with
+# disabling specific errors.
+# Always return a 0 (non-error) status code, even if lint errors are found.
+# This is primarily useful in continuous integration scripts.
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code.
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
+# for backward compatibility.)
+# Return non-zero exit code if any of these messages/categories are detected,
+# even if score is above --fail-under value. Syntax same as enable. Messages
+# specified are enabled, while categories only check already-enabled messages.
+# Specify a score threshold to be exceeded before program exits with error.
+# Interpret the stdin as a python script, whose filename needs to be passed as
+# the module_or_package argument.
+# Files or directories to be skipped. They should be base names, not paths.
+# Add files or directories matching the regex patterns to the ignore-list. The
+# regex matches against paths and can be in Posix or Windows format.
+# Files or directories matching the regex patterns are skipped. The regex
+# matches against base names, not paths. The default value ignores Emacs file
+# locks
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+# Pickle collected data for later comparisons.
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+# Discover python modules and packages in the file system subtree.
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+# In verbose mode, extra non-checker-related info will be displayed.
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+# Tells whether to display a full report or only the messages.
+# Activate the evaluation score.
+# Only show warnings with the listed confidence levels. Leave empty to show all.
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+ bad-inline-option,
+ locally-disabled,
+ file-ignored,
+ suppressed-message,
+ useless-suppression,
+ deprecated-pragma,
+ use-symbolic-message-instead,
+ missing-function-docstring,
+ import-error,
+ logging-fstring-interpolation,
+ too-many-arguments,
+ fixme
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+# List of note tags to take in consideration, separated by a comma.
+ XXX,
+# Regular expression of note tags to take in consideration.
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+# Tells whether unused global variables should be treated as a violation.
+# List of names allowed to shadow builtins
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+ _cb
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore.
+# Tells whether we should check for unused import in __init__ files.
+# List of qualified module names which can have objects that can redefine
+# builtins.
+# Warn about protected attribute access inside special methods
+# List of method names used to declare (i.e. assign) instance attributes.
+ __new__,
+ setUp,
+ __post_init__
+# List of member names, which should be excluded from the protected access
+# warning.
+ _fields,
+ _replace,
+ _source,
+ _make
+# List of valid names for the first argument in a class method.
+# List of valid names for the first argument in a metaclass class method.
+# Naming style matching correct argument names.
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style. If left empty, argument names will be checked with the set
+# naming style.
+# Naming style matching correct attribute names.
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
+# style.
+# Bad variable names which should always be refused, separated by a comma.
+ bar,
+ baz,
+ toto,
+ tutu,
+ tata,
+ test,
+ bla,
+ tmp
+# Bad variable names regexes, separated by a comma. If names match any regex,
+# they will always be refused
+# Naming style matching correct class attribute names.
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
+# Naming style matching correct class constant names.
+# Regular expression matching correct class constant names. Overrides class-
+# const-naming-style. If left empty, class constant names will be checked with
+# the set naming style.
+# Naming style matching correct class names.
+# Regular expression matching correct class names. Overrides class-naming-
+# style. If left empty, class names will be checked with the set naming style.
+# Naming style matching correct constant names.
+# Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming
+# style.
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+# Naming style matching correct function names.
+# Regular expression matching correct function names. Overrides function-
+# naming-style. If left empty, function names will be checked with the set
+# naming style.
+# Good variable names which should always be accepted, separated by a comma.
+ j,
+ k,
+ f,
+ ex,
+ Run,
+ _
+# Good variable names regexes, separated by a comma. If names match any regex,
+# they will always be accepted
+# Include a hint for the correct naming format with invalid-name.
+# Naming style matching correct inline iteration names.
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
+# Naming style matching correct method names.
+# Regular expression matching correct method names. Overrides method-naming-
+# style. If left empty, method names will be checked with the set naming style.
+# Naming style matching correct module names.
+# Regular expression matching correct module names. Overrides module-naming-
+# style. If left empty, module names will be checked with the set naming style.
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+# Regular expression matching correct type variable names. If left empty, type
+# variable names will be checked with the set naming style.
+# Naming style matching correct variable names.
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style. If left empty, variable names will be checked with the set
+# naming style.
+# List of modules that can be imported at any level, not just the top level
+# one.
+# Allow wildcard imports from modules that define __all__.
+# Deprecated modules which should not be used, separated by a comma.
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+# Force import order to recognize a module as part of a third party library.
+# Couples of modules and preferred modules, separated by a comma.
+# Comments are removed from the similarity computation
+# Docstrings are removed from the similarity computation
+# Imports are removed from the similarity computation
+# Signatures are removed from the similarity computation
+# Minimum lines number of a similarity.
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+# List of symbolic message names to ignore for Mixin members.
+ not-async-context-manager,
+ not-context-manager,
+ attribute-defined-outside-init
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+# Regex pattern to define which classes are considered mixins.
+# List of decorators that change the signature of a decorated function.
+# Limits count of emitted suggestions for spelling mistakes.
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the 'python-enchant' package.
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+# List of comma separated words that should not be checked.
+# A path to a file that contains the private dictionary; one word per line.
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+# List of regular expressions of class ancestor names to ignore when counting
+# public methods (see R0903)
+# List of qualified class names to ignore when counting class parents (see
+# R0901)
+# Maximum number of arguments for function / method.
+# Maximum number of attributes for a class (see R0902).
+# Maximum number of boolean expressions in an if statement (see R0916).
+# Maximum number of branch for function / method body.
+# Maximum number of locals for function / method body.
+# Maximum number of parents for a class (see R0901).
+# Maximum number of public methods for a class (see R0904).
+# Maximum number of return / yield for function / method body.
+# Maximum number of statements in function / method body.
+# Minimum number of public methods for a class (see R0903).
+# Exceptions that will emit a warning when caught.
+ Exception
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+# Maximum number of nested blocks for function / method body
+# Complete name of functions that never returns. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )??$
+# Number of spaces of indent required inside a hanging or continued line.
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string=' '
+# Maximum number of characters on a single line.
+# Maximum number of lines in a module.
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
diff --git a/README.md b/README.md
index e2d10c8ec..f48ce822d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,116 @@
-# BiBiGrid2
+# BiBiGrid
+BiBiGrid is a cloud cluster creation and management framework for OpenStack (and more providers in the future).
-BiBiGrid is a tool for an easy cluster setup inside a cloud environment.
+> **Note**
+> The latest version is currently work in progress. Future changes are likely.
+> Not all features of the previous version are available, but they will come soon.
+> The [previous version](https://github.com/BiBiServ/bibigrid/tree/bibigrid-2.3.1) is still available,
+> but not maintained anymore.
+## Getting Started
+For most users the [Hands-On BiBiGrid Tutorial](https://github.com/deNBI/bibigrid_clum2022)
+is the best entry point.
+However, if you are already quite experienced with *OpenStack* and the previous *BiBiGrid* the following brief explanation
+might be just what you need.
+ Brief, technical BiBiGrid2 overview
+### How to configure a cluster?
+#### Configuration File: bibigrid.yml
+A [template](bibigrid.yml) file is included in the repository ([bibigrid.yml](bibigrid.yml)).
+The cluster configuration file consists of a list of configurations. Every configuration describes the provider specific configuration.
+The first configuration additionally contains all the keys that apply to the entire cluster (roles for example).
+Currently only clusters with one provider are possible, so focus only on the first configuration in the list.
+The configuration template [bibigrid.yml](bibigrid.yml) contains many helpful comments, making completing it easier for you.
+[You need more details?](documentation/markdown/features/configuration.md)
+#### Cloud Specification Data: clouds.yml
+To access the cloud, authentication information is required.
+You can download your `clouds.yaml` from OpenStack.
+Your `clouds.yaml` is to be placed in `~/.config/bibigrid/` and will be loaded by BiBiGrid2 on execution.
+[You need more details?](documentation/markdown/features/cloud_specification_data.md)
+### Quick First Time Usage
+If you haven't used BiBiGrid1 in the past or are unfamiliar with OpenStack, we heavily recommend following the
+[tutorial](https://github.com/deNBI/bibigrid_clum2022) instead.
+#### Preparation
+1. Download (or create) the `clouds.yaml` (and optionally `clouds-public.yaml`) file as described [above](#cloud-specification-data-cloudsyml).
+2. Place the `clouds.yaml` into `~/.config/bibigrid`
+3. Fill the configuration, `bibigrid.yml`, with your specifics. At least you need: A master instance with valid type and image,
+a region, an availability zone, an sshUser (most likely ubuntu) and a subnet.
+You probably also want at least one worker with a valid type, image and count.
+4. If your cloud provider runs post-launch services, you need to set the `waitForServices`
+key appropriately which expects a list of services to wait for.
+5. Create a virtual environment from `bibigrid2/requirements.txt`.
+See [here](https://www.akamai.com/blog/developers/how-building-virtual-python-environment) for more detailed info.
+6. Take a look at [First execution](#first-execution)
+#### First execution
+First, follow the steps described in [Preparation](#preparation).
+After cloning the repository navigate to `bibigrid2`.
+In order to execute BiBiGrid2 source the virtual environment created during [preparation](#preparation).
+Take a look at BiBiGrid2's [Command Line Interface](documentation/markdown/features/CLI.md)
+if you want to explore for yourself.
+A first execution run through could be:
+1. `./bibigrid.sh -i [path-to-bibigrid.yml] -ch`: checks the configuration
+2. `./bibigrid.sh -i [path-to-bibigrid.yml] -c`: creates the cluster (execute only if check was successful)
+3. Use **BiBiGrid2's create output** to investigate the created cluster further. Especially connecting to the ide might be helpful.
+Otherwise, connect using ssh.
+4. While in ssh try `sinfo` to print node info
+5. Run `srun -x $(hostname) hostname` to power up a worker and get its hostname.
+6. Run `sinfo` again to see the node powering up. After a while it will be terminated again.
+7. Use the terminate command from **BiBiGrid2's create output** to shut down the cluster again.
+All floating-ips used will be released.
+Great! You've just started and terminated your first cluster using BiBiGrid2!
+### Troubleshooting
+If your cluster doesn't start up, please first make sure your configurations file is valid (`-ch`).
+If it is not, try to modify the configurations file to make it valid. Use `-v` or `-vv` to get a more verbose output,
+so you can find the issue faster. Also double check if you have sufficient permissions to access the project.
+If you can't make your configurations file valid, please contact a developer.
+If your configurations file is valid but the cluster still doesn't start, please contact a developer and/or manually check if your quotas are exceeded.
+Some quotas can currently not be checked by bibigrid.
+**Whenever you contact a developer, please send your logfile along.**
+# Documentation
+If you would like to learn more about BiBiGrid2 please follow a fitting link:
+- [BiBiGrid2 Features](documentation/markdown/bibigrid_feature_list.md)
+- [Software used by BiBiGrid2](documentation/markdown/bibigrid_software_list.md)
+ Differences to BiBiGrid1
+* BiBiGrid2 no longer uses RC- but clouds.yaml-files for cloud-specification data. Environment variables are no longer used (or supported).
+See [Cloud Specification Data](documentation/markdown/features/cloud_specification_data.md).
+* BiBiGrid2 has a largely reworked configurations file, because BiBiGrid2 core supports multiple providers this step was necessary.
+See [Configuration](documentation/markdown/features/configuration.md)
+* BiBiGrid2 currently only implements the provider OpenStack.
+* BiBiGrid2 only starts the master and will dynamically start workers using slurm when they are needed.
+Workers are powered down once they are not used for a longer period.
+* BiBiGrid2 lays the foundation for clusters that are spread over multiple providers, but Hybrid Clouds aren't fully implemented yet.
+# Development
+## Development-Guidelines
+## On implementing concrete providers
+New concrete providers can be implemented very easily. Just copy the `provider.py` file and implement all methods for
+your cloud-provider. Also inherit from the `provider` class. After that add your provider to the providerHandler lists; giving it an associated name for the
+configuration files. By that, your provider is automatically added to BiBiGrid2's tests and regular execution. By testing
+your provider first, you will see whether all provider methods are implemented as expected.
\ No newline at end of file
diff --git a/bibigrid.sh b/bibigrid.sh
new file mode 100755
index 000000000..7739c57ad
--- /dev/null
+++ b/bibigrid.sh
@@ -0,0 +1 @@
+python3 -m bibigrid2.core.startup "$@"
\ No newline at end of file
diff --git a/bibigrid.yml b/bibigrid.yml
new file mode 100644
index 000000000..69f589079
--- /dev/null
+++ b/bibigrid.yml
@@ -0,0 +1,93 @@
+ # See https://cloud.denbi.de/wiki/Tutorials/BiBiGrid/ (after update)
+ # First configuration will be used for general cluster information and must include the master.
+ # All other configurations mustn't include another master, but exactly one vpnWorker instead (keys like master).
+- infrastructure: openstack # former mode. Describes what cloud provider is used (others are not implemented yet)
+ cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)
+ ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
+ #sshPublicKeyFiles:
+ # - [key one]
+ ## Volumes and snapshots that will be mounted to master
+ # - [mount one]
+ #nfsShares: # KEY NOT FULLY IMPLEMENTED YET; /vol/spool/ is automatically created as a nfs
+ # - [nfsShare one]
+ ## Ansible (Galaxy) roles can be added for execution # KEY NOT IMPLEMENTED YET
+ #ansibleRoles:
+ # - file: SomeFile
+ # hosts: SomeHosts
+ # name: SomeName
+ # vars: SomeVars
+ # vars_file: SomeVarsFile
+ #ansibleGalaxyRoles: # KEY NOT IMPLEMENTED YET
+ # - hosts: SomeHost
+ # name: SomeName
+ # galaxy: SomeGalaxy
+ # git: SomeGit
+ # url: SomeURL
+ # vars: SomeVars
+ # vars_file: SomeVarsFile
+ ## Uncomment if you don't want to assign a public ip to the master; for internal cluster (Tuebingen).
+ #useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated
+ # Other keys - default False
+ #localFS: True
+ #localDNSlookup: True
+ #zabbix: True
+ #nfs: True
+ #ide: True # Very useful to set on True. Use `./bibigrid.sh -i [path-to-bibigrid.yml] -ide -cid [cluster-id]` to start port forwarding to access the ide.
+ useMasterAsCompute: True # Currently ignored by slurm
+ #waitForServices: # existing service name that runs after an instance is launched. BiBiGrid's playbook will wait until service is "stopped" to avoid issues
+ # - de.NBI_Bielefeld_environment.service # uncomment for cloud site Bielefeld
+ # master configuration
+ masterInstance:
+ type: # existing type/flavor on your cloud. See launch instance>flavor for options
+ image: # existing image on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/images pick an active one. Currently only ubuntu22.04 is supported
+ # worker configuration
+ #workerInstances:
+ # - type: # existing type/flavor on your cloud. See launch instance>flavor for options
+ # image: # same as master
+ # count: # any number of workers you would like to create with set type, image combination
+ # Depends on cloud image
+ sshUser: # for example ubuntu
+ # Depends on cloud site:
+ # Berlin : regionOne
+ # Bielefeld : bielefeld
+ # DKFZ : regionOne
+ # Giessen : RegionOne
+ # Heidelberg : RegionOne
+ # Tuebingen : RegionOne
+ region: Bielefeld
+ # Depends on cloud site:
+ # Berlin : nova
+ # Bielefeld : default
+ # DKFZ : nova
+ # Giessen : nova
+ # Heidelberg : nova
+ # Tuebingen : nova
+ availabilityZone: default
+ # Depends on cloud site and project
+ subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
+ # Uncomment if no full DNS service for started instances is available.
+ # Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen.
+ #localDNSLookup: True
+ #- [next configurations] # KEY NOT IMPLEMENTED YET
diff --git a/bibigrid2/core/actions/check.py b/bibigrid2/core/actions/check.py
new file mode 100644
index 000000000..41797ec18
--- /dev/null
+++ b/bibigrid2/core/actions/check.py
@@ -0,0 +1,20 @@
+"""Module that acts as a wrapper and uses validateConfiguration to validate given configuration"""
+import logging
+from bibigrid2.core.utility import validate_configuration
+LOG = logging.getLogger("bibigrid")
+def check(configurations, providers):
+    """
+    Validate the given configurations via ValidateConfiguration and report the outcome.
+    The outcome is printed and logged; it is not reflected in the return value.
+    :param configurations: list of configurations (dicts)
+    :param providers: list of providers
+    :return: always 0, whether or not the validation succeeded
+    """
+    validator = validate_configuration.ValidateConfiguration(configurations, providers)
+    success = validator.validate()
+    if success:
+        check_result = "succeeded! Cluster is ready to start."
+    else:
+        check_result = "failed!"
+    print(f"Total check {check_result}")
+    LOG.info("Total check returned %s.", success)
+    return 0
diff --git a/bibigrid2/core/actions/create.py b/bibigrid2/core/actions/create.py
new file mode 100644
index 000000000..bd24ac3fe
--- /dev/null
+++ b/bibigrid2/core/actions/create.py
@@ -0,0 +1,362 @@
+The cluster creation (master's creation, key creation, ansible setup and execution, ...) is done here
+import logging
+import os
+import subprocess
+import threading
+import traceback
+from functools import partial
+import paramiko
+import yaml
+from bibigrid2.core.actions import terminate_cluster
+from bibigrid2.core.utility import ansible_configurator
+from bibigrid2.core.utility import id_generation
+from bibigrid2.core.utility.handler import ssh_handler
+from bibigrid2.core.utility.paths import ansible_resources_path as aRP
+from bibigrid2.core.utility.paths import bin_path as biRP
+from bibigrid2.models import exceptions
+from bibigrid2.models import return_threading
+from bibigrid2.models.exceptions import ExecutionException
+PREFIX = "bibigrid"
+LOG = logging.getLogger("bibigrid")
+def get_identifier(identifier, cluster_id, worker_group="", additional=""):
+    """
+    Builds the name of a master, vpnwkr or worker server from its parts.
+    @param identifier: master|vpnwkr|worker
+    @param cluster_id: id of cluster
+    @param worker_group: group of worker (every member of a group has same flavor/type and image)
+    @param additional: optional suffix; only appended (after another separator) if it is truthy
+    @return: the generated string
+    """
+    name = PREFIX_WITH_SEP + identifier + str(worker_group) + SEPARATOR + cluster_id
+    return (name + SEPARATOR + str(additional)) if additional else name
+# Name generators for the three server kinds: get_identifier with the identifier preset.
+# The master variant also fixes additional="", so a master name never gets a suffix.
+MASTER_IDENTIFIER = partial(get_identifier, identifier="master", additional="")
+WORKER_IDENTIFIER = partial(get_identifier, identifier="worker")
+VPN_WORKER_IDENTIFIER = partial(get_identifier, identifier="vpnwkr")
+# Name prefix and local folder of the per-cluster keypair (see Create.generate_keypair)
+KEY_PREFIX = "tempKey_bibi"
+KEY_FOLDER = os.path.expanduser("~/.config/bibigrid/keys/")
+# Name templates for the application credential and the keypair; fill in via .format(cluster_id=...)
+AC_NAME = "ac" + SEPARATOR + "{cluster_id}"
+KEY_NAME = KEY_PREFIX + SEPARATOR + "{cluster_id}"
+# NOTE(review): presumably the file name behind CLUSTER_MEMORY_PATH, which Create.generate_keypair
+# writes the cluster_id to - the definition of that path is not visible here, confirm
+CLUSTER_MEMORY_FILE = ".bibigrid.mem"
+class Create:  # pylint: disable=too-many-instance-attributes,too-many-arguments
+    """
+    The class Create holds necessary methods to execute the Create-Action
+    """
+    def __init__(self, providers, configurations, config_path, debug=False):
+        """
+        Additionally sets (unique) cluster_id, public_key_commands (to copy public keys to master) and key_name.
+        Call create() to actually start server.
+        :param providers: List of providers (provider)
+        :param configurations: List of configurations (dict)
+        :param config_path: string that is the path to config-file
+        :param debug: Bool. If True Cluster will offer shut-down after create and
+        will ask before shutting down on errors
+        """
+        self.providers = providers
+        self.configurations = configurations
+        self.debug = debug
+        self.cluster_id = id_generation.generate_safe_cluster_id(providers)
+        # the first configuration decides ssh user, public keys and whether the master gets a public ip
+        self.ssh_user = configurations[0].get("sshUser") or "ubuntu"
+        self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands(
+            configurations[0].get("sshPublicKeyFiles"))
+        self.config_path = config_path
+        self.master_ip = None
+        LOG.debug("Cluster-ID: %s", self.cluster_id)
+        self.name = AC_NAME.format(cluster_id=self.cluster_id)
+        self.key_name = KEY_NAME.format(cluster_id=self.cluster_id)
+        self.instance_counter = 0
+        self.thread_lock = threading.Lock()
+        self.use_master_with_public_ip = configurations[0].get("useMasterWithPublicIp", True)
+        LOG.debug("Keyname: %s", self.key_name)
+    def generate_keypair(self):
+        """
+        Generates an ECDSA keypair in KEY_FOLDER using the system tool ssh-keygen, uploads the generated
+        public key to all providers and writes the cluster_id to the cluster memory file.
+        ToDo find a more pythonic way to create an ECDSA keypair
+        See here for why using python module ECDSA wasn't successful
+        https://stackoverflow.com/questions/71194770/why-does-creating-ecdsa-keypairs-via-python-differ-from-ssh-keygen-t-ecdsa-and
+        :return:
+        """
+        # create KEY_FOLDER if it doesn't exist; makedirs also creates missing parent folders
+        if not os.path.isdir(KEY_FOLDER):
+            LOG.info("%s not found. Creating folder.", KEY_FOLDER)
+            os.makedirs(KEY_FOLDER, exist_ok=True)
+        key_path = os.path.join(KEY_FOLDER, self.key_name)
+        # generate keyfile; an argument list (no shell) keeps the path from being interpreted by a shell
+        res = subprocess.check_output(["ssh-keygen", "-t", "ecdsa", "-f", key_path, "-P", ""]).decode()
+        LOG.debug(res)
+        # read public keyfile
+        with open(f"{key_path}.pub", mode="r", encoding="UTF-8") as key_file:
+            public_key = key_file.read()
+        # upload public key
+        for provider in self.providers:
+            provider.create_keypair(name=self.key_name, public_key=public_key)
+        # write cluster_id to automatically read it on following calls if no cid is given
+        with open(CLUSTER_MEMORY_PATH, mode="w+", encoding="UTF-8") as cluster_memory_file:
+            yaml.safe_dump(data={"cluster_id": self.cluster_id}, stream=cluster_memory_file)
+    def start_instance(self, provider, identifier, instance_type, network, volumes=None,
+                       external_network=None):
+        """
+        Starts any (master,worker,vpn) single server/instance in given network on given provider
+        with floating-ip if master or vpn and with volume if master.
+        :param provider: provider server will be started on
+        :param identifier: MASTER_IDENTIFIER, WORKER_IDENTIFIER or VPN_WORKER_IDENTIFIER
+        :param instance_type: dict from configuration containing server type, image and count (but count is not needed)
+        :param network: string network where server will be started in.
+        All server of a provider are started in the same network
+        :param volumes: list of volumes that are to be attached to the server. Currently only relevant for master
+        :param external_network: string only needed for master or vpn to attach a floating ip
+        :return: address under which the server is reached (floating ip, or private_v4 for a master
+        without public ip); None for workers
+        """
+        # potentially weird counting due to master
+        with self.thread_lock:
+            if identifier == MASTER_IDENTIFIER:  # pylint: disable=comparison-with-callable
+                name = identifier(cluster_id=self.cluster_id)
+            elif identifier == WORKER_IDENTIFIER:  # pylint: disable=comparison-with-callable
+                # NOTE(review): get_identifier has no parameter "number"; this call raises a TypeError.
+                # Confirm the intended naming scheme (worker_group/additional) before using this path.
+                name = identifier(number=self.instance_counter, cluster_id=self.cluster_id)
+            # NOTE(review): no name is assigned for VPN_WORKER_IDENTIFIER (the else branch was commented
+            # out), so starting a vpnwkr fails below with an UnboundLocalError. Confirm the naming scheme.
+            self.instance_counter += 1
+        LOG.info("Starting instance/server %s", name)
+        flavor = instance_type["type"]
+        image = instance_type["image"]
+        server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name,
+                                        image=image, network=network, volumes=volumes)
+        floating_ip = None
+        # pylint: disable=comparison-with-callable
+        if identifier == VPN_WORKER_IDENTIFIER or (
+                identifier == MASTER_IDENTIFIER and self.use_master_with_public_ip):
+            # wait seems to be included. Not in documentation
+            floating_ip = provider.attach_available_floating_ip(network=external_network,
+                                                                server=server)["floating_ip_address"]
+        elif identifier == MASTER_IDENTIFIER:
+            # master without public ip: fall back to its private address
+            floating_ip = provider.conn.get_server(server["id"])["private_v4"]
+        # pylint: enable=comparison-with-callable
+        return floating_ip
+    def start_instances(self, configuration, provider):
+        """
+        Starts the master or vpn instance of a provider in its own thread and sets it up once it is reachable.
+        Workers are not started by this method.
+        :param configuration: dict configuration of said provider
+        :param provider: provider
+        :return:
+        """
+        LOG.info("Starting instances on %s", provider.NAME)
+        identifier, instance_type, volumes = self.prepare_vpn_or_master_args(configuration, provider)
+        external_network = provider.get_external_network(configuration["network"])
+        # Starts master/vpn. Uses return threading to get floating_ip of master/vpn
+        vpn_or_master_thread = return_threading.ReturnThread(target=self.start_instance,
+                                                             args=[provider,
+                                                                   identifier,
+                                                                   instance_type,
+                                                                   configuration["network"],
+                                                                   volumes,
+                                                                   external_network])
+        vpn_or_master_thread.start()
+        LOG.info("Waiting for servers to start-up on cloud %s", provider.cloud_specification['identifier'])
+        vpn_or_m_floating_ip_address = vpn_or_master_thread.join()
+        self.setup_reachable_servers(configuration, vpn_or_m_floating_ip_address)
+    def prepare_vpn_or_master_args(self, configuration, provider):
+        """
+        Prepares start_instance arguments for master/vpn
+        :param configuration: configuration (dict) of said master/vpn
+        :param provider: provider
+        :return: identifier, instance_type and volumes as needed by start_instance
+        :raises KeyError: if the configuration contains neither masterInstance nor vpnInstance
+        """
+        if configuration.get("masterInstance"):
+            instance_type = configuration["masterInstance"]
+            identifier = MASTER_IDENTIFIER
+            master_mounts = configuration.get("masterMounts", [])
+            volumes = self.prepare_volumes(provider, master_mounts)
+        elif configuration.get("vpnInstance"):
+            instance_type = configuration["vpnInstance"]
+            # without this assignment the return below failed with an UnboundLocalError
+            identifier = VPN_WORKER_IDENTIFIER
+            volumes = []  # only master has volumes
+        else:
+            LOG.warning("Configuration %s has no vpnwkr or master and is therefore unreachable.", configuration)
+            raise KeyError
+        return identifier, instance_type, volumes
+    def setup_reachable_servers(self, configuration, vpn_or_m_floating_ip_address):
+        """
+        Executes necessary commands on master or vpnwkr
+        :param configuration: said configuration
+        :param vpn_or_m_floating_ip_address: floating_ip to master or vpnwkr
+        """
+        if configuration.get("masterInstance"):
+            self.master_ip = vpn_or_m_floating_ip_address
+            ssh_handler.ansible_preparation(floating_ip=vpn_or_m_floating_ip_address,
+                                            private_key=KEY_FOLDER + self.key_name,
+                                            username=self.ssh_user,
+                                            commands=self.ssh_add_public_key_commands)
+        elif configuration.get("vpnInstance"):
+            # connect to the vpnwkr itself; self.master_ip belongs to the master and may not even be
+            # set yet, because the providers are handled in parallel threads
+            ssh_handler.execute_ssh(floating_ip=vpn_or_m_floating_ip_address,
+                                    private_key=KEY_FOLDER + self.key_name,
+                                    username=self.ssh_user,
+                                    commands=ssh_handler.VPN_SETUP)
+    def prepare_volumes(self, provider, mounts):
+        """
+        Creates volumes from snapshots and returns all volumes (pre-existing and newly created)
+        :param provider: provider on which the volumes and snapshots exist
+        :param mounts: volumes or snapshots
+        :return: set of ids of pre-existing and newly created volumes
+        """
+        LOG.info("Preparing volumes")
+        volumes = []
+        for mount in mounts:
+            # the lookup may find nothing; only index into an actual result
+            volume = provider.get_volume_by_id_or_name(mount)
+            volume_id = volume["id"] if volume else None
+            if not volume_id:
+                LOG.debug("Volume %s does not exist. Checking for snapshot.", mount)
+                volume_id = provider.create_volume_from_snapshot(mount)
+            if volume_id:
+                volumes.append(volume_id)
+            else:
+                LOG.warning("Mount %s is neither a snapshot nor a volume.", mount)
+        ret_volumes = set(volumes)
+        if len(ret_volumes) < len(volumes):
+            LOG.warning("Identical mounts found in masterMounts list. "
+                        "Trying to set() to save the run. Check configurations!")
+        return ret_volumes
+    def prepare_configurations(self):
+        """
+        Makes sure that subnet and network key are set for each configuration.
+        If none is set a keyError will be raised and caught in create.
+        :return:
+        """
+        for configuration, provider in zip(self.configurations, self.providers):
+            if not configuration.get("network"):
+                configuration["network"] = provider.get_network_id_by_subnet(configuration["subnet"])
+            elif not configuration.get("subnet"):
+                configuration["subnet"] = provider.get_subnet_ids_by_network(configuration["network"])
+            configuration["sshUser"] = self.ssh_user  # is used in ansibleConfigurator
+    def upload_data(self):
+        """
+        Configures ansible and then executes the ansible start commands on the master via ssh
+        :return:
+        """
+        if not os.path.isdir(aRP.VARS_FOLDER):
+            LOG.info("%s not found. Creating folder.", aRP.VARS_FOLDER)
+            os.mkdir(aRP.VARS_FOLDER)
+        ansible_configurator.configure_ansible_yaml(providers=self.providers,
+                                                    configurations=self.configurations,
+                                                    cluster_id=self.cluster_id)
+        ssh_handler.execute_ssh(floating_ip=self.master_ip, private_key=KEY_FOLDER + self.key_name,
+                                username=self.ssh_user,
+                                commands=ssh_handler.ANSIBLE_START +
+                                [ssh_handler.get_ac_command(self.providers[0], AC_NAME.format(
+                                    cluster_id=self.cluster_id))])
+    def start_start_instances_threads(self):
+        """
+        Starts for each provider a start_instances thread and joins them.
+        :return:
+        """
+        start_instances_threads = []
+        for configuration, provider in zip(self.configurations, self.providers):
+            start_instances_thread = return_threading.ReturnThread(target=self.start_instances,
+                                                                   args=[configuration, provider])
+            start_instances_thread.start()
+            start_instances_threads.append(start_instances_thread)
+        for start_instance_thread in start_instances_threads:
+            start_instance_thread.join()
+    def create(self):
+        """
+        Creates cluster and prints helpful cluster-info afterwards.
+        If debug is set True it offers termination after starting the cluster.
+        If any step fails, the error is logged and the cluster is terminated again.
+        :return: exit_state (0 on success, 1 if an error occurred)
+        """
+        self.generate_keypair()
+        try:
+            self.prepare_configurations()
+            self.start_start_instances_threads()
+            self.upload_data()
+            self.print_cluster_start_info()
+            if self.debug:
+                LOG.info("DEBUG MODE: Entering termination...")
+                terminate_cluster.terminate_cluster(cluster_id=self.cluster_id, providers=self.providers,
+                                                    debug=self.debug)
+        except exceptions.ConnectionException:
+            LOG.error("Connection couldn't be established. Check Provider connection.")
+        except paramiko.ssh_exception.NoValidConnectionsError:
+            LOG.error("SSH connection couldn't be established. Check keypair.")
+        except KeyError as exc:
+            LOG.error(f"Tried to access dictionary key {str(exc)}, but couldn't. Please check your configurations.")
+        except FileNotFoundError as exc:
+            LOG.error(f"Tried to access resource files but couldn't. No such file or directory: {str(exc)}")
+        except TimeoutError as exc:
+            LOG.error(f"Timeout while connecting to master. Maybe you are trying to create a master without "
+                      f"public ip "
+                      f"while not being in the same network: {str(exc)}")
+        except ExecutionException as exc:
+            if self.debug:
+                LOG.error(traceback.format_exc())
+            LOG.error(f"Execution of cmd on remote host fails: {str(exc)}")
+        except Exception as exc:  # pylint: disable=broad-except
+            if self.debug:
+                LOG.error(traceback.format_exc())
+            LOG.error(f"Unexpected error: '{str(exc)}' ({type(exc)}) Contact a developer!)")
+        else:
+            return 0  # will be called if no exception occurred
+        # only reached after a handled exception: clean up whatever has been created so far
+        terminate_cluster.terminate_cluster(cluster_id=self.cluster_id, providers=self.providers, debug=self.debug)
+        return 1
+    def print_cluster_start_info(self):
+        """
+        Prints helpful cluster-info:
+        SSH: How to connect to master via SSH
+        Terminate: What bibigrid2 command is needed to terminate the created cluster
+        Detailed cluster info: How to print detailed info about the created cluster
+        IDE Port Forwarding: only if "ide" is set in the first configuration
+        :return:
+        """
+        print(f"Cluster {self.cluster_id} with master {self.master_ip} up and running!")
+        print(f"SSH: ssh -i '{KEY_FOLDER}{self.key_name}' {self.ssh_user}@{self.master_ip}")
+        print(f"Terminate cluster: ./bibigrid.sh -i '{self.config_path}' -t -cid {self.cluster_id}")
+        print(f"Detailed cluster info: ./bibigrid.sh -i '{self.config_path}' -l -cid {self.cluster_id}")
+        if self.configurations[0].get("ide"):
+            print(f"IDE Port Forwarding: ./bibigrid.sh -i '{self.config_path}' -ide -cid {self.cluster_id}")
diff --git a/bibigrid2/core/actions/ide.py b/bibigrid2/core/actions/ide.py
new file mode 100644
index 000000000..d1877a826
--- /dev/null
+++ b/bibigrid2/core/actions/ide.py
@@ -0,0 +1,95 @@
+This module contains methods to establish port forwarding in order to access an ide (theia).
+import logging
+import random
+import re
+import signal
+import subprocess
+import sys
+import time
+import webbrowser
+import sshtunnel
+from bibigrid2.core.utility.handler import cluster_ssh_handler
+# Upper bound of the random port increment that ide() applies when the local port is already in use
+MAX_JUMP = 100
+LOG = logging.getLogger("bibigrid")
+def sigint_handler(caught_signal, frame):  # pylint: disable=unused-argument
+    """
+    Signal handler for SIGINT: announces the shutdown and ends the program with exit code 0.
+    @param caught_signal: received signal (unused)
+    @param frame: stack frame at the time of the signal (unused)
+    @return: does not return; exits with code 0
+    """
+    print("Exiting...")
+    raise SystemExit(0)
+# Route SIGINT (CTRL+C) to sigint_handler; registered as soon as this module is imported
+signal.signal(signal.SIGINT, sigint_handler)
+def is_used(ip_address):
+    """
+    Searches the output of "netstat -na" for established tcp connections that contain the given address
+    and collects the port that follows the address. Every matching line is printed.
+    https://stackoverflow.com/questions/62000168/how-to-check-if-ssh-tunnel-is-being-used
+    :param ip_address: address to look for
+    :return: list of the found ports (as strings); empty if no established connection matches
+    """
+    # escape the address so that its dots only match literal dots
+    established = re.compile(rf'tcp.*{re.escape(ip_address)}:([0-9][0-9]*).*ESTABLISHED\s*$')
+    ports_used = []
+    with subprocess.Popen(["netstat", "-na"], stdout=subprocess.PIPE) as process:
+        out = process.stdout.read()
+    for line in out.decode('utf-8').split('\n'):
+        is_open = established.match(line)
+        if is_open is not None:
+            print(line)
+            ports_used.append(is_open[1])
+    # the collected ports were never handed back to the caller before
+    return ports_used
+def ide(cluster_id, master_provider, master_configuration):
+    """
+    Forwards the local port LOCAL_BIND_ADDRESS to port REMOTE_BIND_ADDRESS on the master of the specified
+    cluster and opens the forwarded address in the web browser. If the local port is in use, a randomly
+    increased port is tried instead (at most 16 attempts). The forwarding stays open until the program is
+    interrupted.
+    @param cluster_id: cluster_id or ip
+    @param master_provider: master's provider
+    @param master_configuration: master's configuration
+    @return: 1 (only reached if no forwarding could be established)
+    """
+    LOG.info("Starting port forwarding for ide")
+    master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider,
+                                                                                        master_configuration)
+    used_local_bind_address = LOCAL_BIND_ADDRESS
+    if master_ip and ssh_user and used_private_key:
+        for attempts in range(1, 17):
+            try:
+                with sshtunnel.SSHTunnelForwarder(
+                        ssh_address_or_host=master_ip,
+                        ssh_username=ssh_user,
+                        ssh_pkey=used_private_key,
+                        local_bind_address=(LOCALHOST, used_local_bind_address),
+                        remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)
+                ) as server:
+                    print("CTRL+C to close port forwarding when you are done.")
+                    with server:
+                        # opens in existing window if any default program exists
+                        webbrowser.open(f"http://localhost:{used_local_bind_address}", new=2)
+                        # keep the tunnel open until the program is interrupted
+                        while True:
+                            time.sleep(5)
+            except sshtunnel.HandlerSSHTunnelForwarderError:
+                used_local_bind_address += random.randint(1, MAX_JUMP)
+                LOG.info("Attempt: %s. Port in use... Trying new port %s", attempts, used_local_bind_address)
+    # tell the user which part of the connection info is missing
+    if not master_ip:
+        LOG.warning("Cluster id %s doesn't match an existing cluster with a master.", cluster_id)
+    if not ssh_user:
+        LOG.warning("No ssh user has been specified in the first configuration.")
+    if not used_private_key:
+        LOG.warning("No matching sshPublicKeyFiles can be found in the first configuration or in .bibigrid")
+    return 1
diff --git a/bibigrid2/core/actions/list_clusters.py b/bibigrid2/core/actions/list_clusters.py
new file mode 100644
index 000000000..58f9924ae
--- /dev/null
+++ b/bibigrid2/core/actions/list_clusters.py
@@ -0,0 +1,152 @@
+This module contains methods to list all clusters or a specific cluster in a formatted, readable output.
+This includes a method to create a dictionary containing all running clusters and their servers.
+import logging
+import pprint
+import re
+from bibigrid2.core.actions import create
+# Matches bibigrid server names. Groups: 2 = "master" and 3 = cluster id (names bibigrid-master-<cluster id>);
+# 4 = "worker"|"vpnwkr" and 5 = cluster id (names bibigrid-<kind><digits>-<cluster id>-<digits>)
+SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpnwkr)\d+-([a-zA-Z0-9]+)-\d+)$")
+LOG = logging.getLogger("bibigrid")
+def dict_clusters(providers):
+    """
+    Collects all servers whose name matches SERVER_REGEX from all providers and groups them by cluster id.
+    :param providers: list of all providers
+    :return: dict that maps each cluster id to a dict with the keys "workers" and "vpnwkrs"
+    (lists of servers) and, if a master was found, "master" (server)
+    """
+    LOG.info("Creating cluster dictionary...")
+    cluster_dict = {}
+    for provider in providers:
+        for server in provider.list_servers():
+            match = SERVER_REGEX.match(server["name"])
+            if not match:
+                continue
+            # groups 4/5 are set for worker and vpnwkr names, groups 2/3 for master names
+            identifier = match.group(4) or match.group(2)
+            cluster_id = match.group(5) or match.group(3)
+            setup(cluster_dict, cluster_id, server, provider)
+            cluster_entry = cluster_dict[cluster_id]
+            if identifier == "master":
+                cluster_entry[identifier] = server
+            else:
+                cluster_entry[identifier + "s"].append(server)
+    return cluster_dict
+def setup(cluster_dict, cluster_id, server, provider):
+    """
+    Generates an entry with empty worker and vpnwkr lists for cluster_id in cluster_dict if there is none yet
+    and adds provider information to the server.
+    :param server: found server (dict); gets the keys "provider" and "cloud_specification"
+    :param cluster_id: id of said cluster
+    :param cluster_dict: dict containing all found servers by their cluster_id
+    :param provider: server's provider
+    :return:
+    """
+    if not cluster_dict.get(cluster_id):
+        cluster_dict[cluster_id] = {"workers": [], "vpnwkrs": []}
+    server["provider"] = provider.NAME
+    server["cloud_specification"] = provider.cloud_specification["identifier"]
+def print_list_clusters(cluster_id, providers):
+    """
+    Calls dict_clusters and gives a visual representation of the found cluster(s).
+    With a cluster_id the size overview and the complete dictionary of that cluster are printed.
+    Without a cluster_id an overview (selected master values, security groups, networks, size)
+    is printed for every cluster.
+    :param cluster_id: id of the cluster to print in detail; falsy to print the overview of all clusters
+    :param providers: list of providers
+    :return: 0
+    """
+    cluster_dict = dict_clusters(providers=providers)
+    if cluster_id:  # pylint: disable=too-many-nested-blocks
+        if cluster_dict.get(cluster_id):
+            LOG.info("Printing specific cluster_dictionary")
+            master_count, worker_count, vpn_count = get_size_overview(cluster_dict[cluster_id])
+            print(f"\tCluster has {master_count} master, {vpn_count} vpnwkr and {worker_count} regular workers. "
+                  f"The cluster is spread over {vpn_count + master_count} reachable provider(s).")
+            pprint.pprint(cluster_dict[cluster_id])
+        else:
+            # the message was a plain string before, so the placeholder was logged literally
+            LOG.info("Cluster with cluster-id %s not found.", cluster_id)
+            print(f"Cluster with cluster-id {cluster_id} not found.")
+    else:
+        LOG.info("Printing overview of cluster all clusters")
+        if cluster_dict:
+            for cluster_key_id, cluster_node_dict in cluster_dict.items():
+                print(f"Cluster-ID: {cluster_key_id}")
+                master = cluster_node_dict.get('master')
+                if master:
+                    # only print the master values that are actually set
+                    for key in ["name", "user_id", "launched_at", "key_name", "public_v4", "public_v6", "provider"]:
+                        value = cluster_node_dict['master'].get(key)
+                        if value:
+                            print(f"\t{key}: {value}")
+                    security_groups = get_security_groups(cluster_node_dict)
+                    print(f"\tsecurity_groups: {security_groups}")
+                    networks = get_networks(cluster_node_dict)
+                    print(f"\tnetwork: {pprint.pformat(networks)}")
+                else:
+                    LOG.warning("No master for cluster: %s.", cluster_key_id)
+                master_count, worker_count, vpn_count = get_size_overview(cluster_node_dict)
+                print(f"\tCluster has {master_count} master, {vpn_count} vpnwkr and {worker_count} regular workers. "
+                      f"The cluster is spread over {vpn_count + master_count} reachable provider(s).")
+        else:
+            print("No cluster found.")
+    return 0
+def get_size_overview(cluster_dict):
+    """
+    Counts the servers of a single cluster by kind.
+    :param cluster_dict: dictionary of cluster to size_overview
+    :return: number of masters (0 or 1), number of workers, number of vpns
+    """
+    LOG.info("Printing size overview")
+    master_count = 1 if cluster_dict.get("master") else 0
+    # missing or empty entries count as zero
+    worker_count = len(cluster_dict.get("workers") or [])
+    vpn_count = len(cluster_dict.get("vpnwkrs") or [])
+    return master_count, worker_count, vpn_count
+def get_networks(cluster_dict):
+    """
+    Gets the keys of the "addresses" entry of the master and of every vpnwkr of a cluster
+    :param cluster_dict: dictionary of a cluster; must contain "master"
+    :return: list with one dict per server (master first) mapping the server's provider to its address keys
+    """
+    reachable_servers = [cluster_dict["master"]] + list(cluster_dict.get("vpnwkrs") or [])
+    return [{server["provider"]: list(server["addresses"].keys())} for server in reachable_servers]
+def get_security_groups(cluster_dict):
+    """
+    Gets the security groups of the master and of every vpnwkr of a cluster
+    :param cluster_dict: dictionary of a cluster; must contain "master"
+    :return: list with one dict per server (master first) mapping the server's provider to its security_groups
+    """
+    reachable_servers = [cluster_dict["master"]] + list(cluster_dict.get("vpnwkrs") or [])
+    return [{server["provider"]: server["security_groups"]} for server in reachable_servers]
+def get_master_access_ip(cluster_id, master_provider):
+    """
+    Returns the ip of the first server on master_provider whose name starts with the master name of cluster_id
+    :param master_provider: master's provider
+    :param cluster_id: Id of cluster
+    :return: public_v4, else public_v6, else private_v4 of the master; None if no master was found
+    """
+    LOG.info("Finding master ip for cluster %s...", cluster_id)
+    for server in master_provider.list_servers():
+        master_name = create.MASTER_IDENTIFIER(cluster_id=cluster_id)
+        if not server["name"].startswith(master_name):
+            continue
+        # prefer public addresses; fall back to the private one
+        return server.get("public_v4") or server.get("public_v6") or server.get("private_v4")
+    LOG.warning("Cluster %s not found on master_provider %s.", cluster_id, master_provider)
+    return None
diff --git a/bibigrid2/core/actions/terminate_cluster.py b/bibigrid2/core/actions/terminate_cluster.py
new file mode 100644
index 000000000..67f744dc8
--- /dev/null
+++ b/bibigrid2/core/actions/terminate_cluster.py
@@ -0,0 +1,173 @@
+This module contains methods to terminate a cluster. i.e. to delete all servers, keypairs (local and remote)
+and application credentials used by it.
+import logging
+import os
+import re
+from bibigrid2.core.actions import create
+LOG = logging.getLogger("bibigrid")
+def terminate_cluster(cluster_id, providers, debug=False):
+    """
+    Deletes the local keypair of the cluster, then terminates the cluster's servers and deletes its keypair
+    on every provider, deletes the application credential on the first provider and logs the result.
+    If no local keyfile was found, the user has to confirm the termination first.
+    :param debug: if set user gets asked before termination is executed
+    :param providers: list of providers
+    :param cluster_id: ID of cluster to terminate
+    :return: 0
+    """
+    if debug and not input(f"DEBUG MODE: Any non-empty input to shutdown cluster {cluster_id}. "
+                           "Empty input to exit with cluster still alive:"):
+        return 0
+    cluster_server_state = []
+    cluster_keypair_state = []
+    tmp_keyname = create.KEY_NAME.format(cluster_id=cluster_id)
+    local_keypairs_deleted = delete_local_keypairs(tmp_keyname)
+    # missing local keyfiles hint at a foreign cluster, so ask before touching anything remote
+    if not (local_keypairs_deleted or input(f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. "
+                                            f"This might not be your cluster. Are you sure you want to terminate it?\n"
+                                            f"Any non-empty input to shutdown cluster {cluster_id}. "
+                                            f"Empty input to exit with cluster still alive:")):
+        return 0
+    for provider in providers:
+        LOG.info("Terminating cluster %s on on cloud %s",
+                 cluster_id, provider.cloud_specification['identifier'])
+        server_list = provider.list_servers()
+        cluster_server_state.extend(terminate_servers(server_list, cluster_id, provider))
+        cluster_keypair_state.append(delete_keypairs(provider, tmp_keyname))
+    ac_state = delete_application_credentials(providers[0], cluster_id)
+    terminate_output(cluster_server_state, cluster_keypair_state, ac_state, cluster_id)
+    return 0
+def terminate_servers(server_list, cluster_id, provider):
+    """
+    Terminates all servers in server_list whose name is a bibigrid master, worker or vpnwkr name of
+    exactly the given cluster.
+    @param server_list: list of server dicts. All servers are from provider
+    @param cluster_id: id of cluster to terminate
+    @param provider: provider that holds all servers in server_list
+    @return: a list of the servers' (that were to be terminated) termination states
+    """
+    LOG.info("Deleting servers on provider %s...", provider.cloud_specification['identifier'])
+    cluster_server_state = []
+    # The id is escaped and matched exactly. The former "{cluster_id}+" repeated the id's last
+    # character, so servers of a different cluster (e.g. "abcc" for id "abc") were matched as well.
+    escaped_id = re.escape(cluster_id)
+    server_regex = re.compile(fr"^bibigrid-(master-{escaped_id}|(worker|vpnwkr)\d+-{escaped_id}-\d+)$")
+    for server in server_list:
+        if server_regex.match(server["name"]):
+            LOG.info("Trying to terminate Server %s on cloud %s.",
+                     server['name'], provider.cloud_specification['identifier'])
+            cluster_server_state.append(terminate_server(provider, server))
+    return cluster_server_state
+def terminate_server(provider, server):
+    """
+    Deletes a single server on its provider and logs whether that worked
+    @param provider: the provider that holds the server
+    @param server: the server that is to be terminated
+    @return: result of provider.delete_server; truthy if the server has been terminated
+    """
+    terminated = provider.delete_server(server["id"])
+    if terminated:
+        LOG.info("Server %s terminated on provider %s.",
+                 server['name'], provider.cloud_specification['identifier'])
+    else:
+        LOG.warning("Unable to terminate server %s on provider %s.",
+                    server['name'], provider.cloud_specification['identifier'])
+    return terminated
+def delete_keypairs(provider, tmp_keyname):
+    """
+    Deletes the keypair of a cluster from the given provider and logs whether that worked
+    @param provider: provider to delete keypair from
+    @param tmp_keyname: BiBiGrid2 keyname
+    @return: result of provider.delete_keypair; truthy if the keypair was deleted
+    """
+    LOG.info("Deleting Keypair on provider %s...", provider.cloud_specification['identifier'])
+    deleted = provider.delete_keypair(tmp_keyname)
+    if not deleted:
+        LOG.warning("Unable to delete %s on provider %s.", tmp_keyname, provider.cloud_specification['identifier'])
+    else:
+        LOG.info("Keypair %s deleted on provider %s.", tmp_keyname, provider.cloud_specification['identifier'])
+    return deleted
+def delete_local_keypairs(tmp_keyname):
+    """
+    Deletes the local private and public keyfile of a cluster from KEY_FOLDER
+    @param tmp_keyname: BiBiGrid2 keyname
+    @return: Returns true if at least one local keyfile (pub or private) was found
+    """
+    success = False
+    LOG.info("Deleting Keypair locally...")
+    tmp_keypath = os.path.join(create.KEY_FOLDER, tmp_keyname)
+    # the private keyfile is handled first, then the public one
+    for kind, keypath in (("private", tmp_keypath), ("public", tmp_keypath + ".pub")):
+        if os.path.isfile(keypath):
+            os.remove(keypath)
+            success = True
+        else:
+            LOG.warning(f"Unable to find {kind} keyfile '{keypath}' locally. No local {kind} keyfile deleted.")
+    return success
+def delete_application_credentials(master_provider, cluster_id):
+    """
+    Deletes the application credential of the cluster from the master_provider, unless the provider's
+    "auth" section itself contains an application credential id and secret.
+    @param master_provider: provider that holds the master
+    @param cluster_id: id of the cluster whose application credential is to be deleted
+    @return: True if nothing needed deletion; otherwise the result of the provider's deletion call
+    """
+    auth = master_provider.cloud_specification["auth"]
+    if auth.get("application_credential_id") and auth.get("application_credential_secret"):
+        LOG.info("Because you used application credentials to authenticate, "
+                 "no created application credentials need deletion.")
+        return True
+    credential_name = create.AC_NAME.format(cluster_id=cluster_id)
+    return master_provider.delete_application_credential_by_id_or_name(credential_name)
+def terminate_output(cluster_server_state, cluster_keypair_state, ac_state, cluster_id):
+    """
+    Logs the termination result in detail
+    @param cluster_server_state: list of bools. Each bool stands for a server termination
+    @param cluster_keypair_state: list of bools. Each bool stands for a keypair deletion
+    @param ac_state: bool that stands for the deletion of the credentials on the master
+    @param cluster_id: id of the terminated cluster
+    @return:
+    """
+    cluster_server_terminated = all(cluster_server_state)
+    cluster_keypair_deleted = all(cluster_keypair_state)
+    # an empty list of server states means that no server matched the cluster id
+    if not cluster_server_state:
+        LOG.warning("Unable to find any servers for cluster-id %s. "
+                    "Check cluster-id and configuration.\nAll keys deleted: %s",
+                    cluster_id, cluster_keypair_deleted)
+        return
+    if cluster_server_terminated:
+        LOG.info("Terminated all servers of cluster %s.", cluster_id)
+    else:
+        LOG.warning("Unable to terminate all servers of cluster %s.", cluster_id)
+    if cluster_keypair_deleted:
+        LOG.info("Deleted all keypairs of cluster %s.", cluster_id)
+    else:
+        LOG.warning("Unable to delete all keypairs of cluster %s.", cluster_id)
+    if cluster_server_terminated and cluster_keypair_deleted:
+        out = f"Successfully terminated cluster {cluster_id}."
+        LOG.info(out)
+        print(out)
+    else:
+        LOG.warning("Unable to terminate cluster %s properly."
+                    "\nAll servers terminated: %s\nAll keys deleted: %s",
+                    cluster_id, cluster_server_terminated, cluster_keypair_deleted)
+    if ac_state:
+        LOG.info("Successfully handled application credential of cluster %s.", cluster_id)
+    else:
+        LOG.warning("Unable to delete application credential of cluster %s", cluster_id)
diff --git a/bibigrid2/core/actions/update.py b/bibigrid2/core/actions/update.py
new file mode 100644
index 000000000..091e39300
--- /dev/null
+++ b/bibigrid2/core/actions/update.py
@@ -0,0 +1,27 @@
+Module that contains methods to update the master playbook
+import logging
+from bibigrid2.core.utility import ansible_commands as aC
+from bibigrid2.core.utility.handler import ssh_handler
+from bibigrid2.core.utility.paths import ansible_resources_path as aRP
+from bibigrid2.core.utility.paths import bin_path as biRP
+from bibigrid2.core.utility.handler import cluster_ssh_handler
+LOG = logging.getLogger("bibigrid")
def update(cluster_id, master_provider, master_configuration):
    """
    Executes the ansible playbook on the master of an existing cluster.
    :param cluster_id: id of the cluster to update
    :param master_provider: provider the master is running on
    :param master_configuration: configuration that belongs to master_provider
    :return: 0 if connection info was found and the commands were handed to the ssh handler, 1 otherwise
    """
    LOG.info("Starting update...")
    master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(
        cluster_id, master_provider, master_configuration)
    if not (master_ip and ssh_user and used_private_key):
        return 1
    # user@host order; the arguments used to be passed the other way round
    LOG.info("Trying to update %s@%s", ssh_user, master_ip)
    # NOTE(review): the execute_ssh call is cut off in the visible source (no closing parenthesis). It is closed
    # here with the visible arguments only. The otherwise unused aRP/biRP imports suggest that an argument
    # (presumably files to upload) is missing - confirm against the ssh_handler.execute_ssh signature.
    ssh_handler.execute_ssh(floating_ip=master_ip, private_key=used_private_key, username=ssh_user,
                            commands=[aC.EXECUTE])
    return 0
diff --git a/bibigrid2/core/actions/version.py b/bibigrid2/core/actions/version.py
new file mode 100644
index 000000000..0ddbdb45d
--- /dev/null
+++ b/bibigrid2/core/actions/version.py
@@ -0,0 +1,6 @@
+Contains the static variable __version__ which holds the current version number.
# Current version number; printed by the version action in startup.run_action.
__version__ = "0.2.0"
diff --git a/bibigrid2/core/provider.py b/bibigrid2/core/provider.py
new file mode 100644
index 000000000..1c50c8bb6
--- /dev/null
+++ b/bibigrid2/core/provider.py
@@ -0,0 +1,210 @@
+Holds the abstract class Provider
class Provider:  # pylint: disable=too-many-public-methods
    """
    Interface every cloud provider implementation has to fulfil. Apart from __init__, get_active_images and
    get_active_flavors the methods have no implementation here and need to be overridden.
    See detailed return value information in tests>provider>test_Provider.
    Make sure to register your newly implemented provider in provider_handler: name:class
    This will automatically register it for testing when startupTests main is called.
    """
    NAME = "Provider"

    class QuotaExceededException(Exception):
        """
        Just a renamed Exception.
        """

    def __init__(self, cloud_specification):
        """
        Call necessary methods to create a connection and save cloud_specification data as needed.
        Adds the key "identifier" to cloud_specification (in place): the profile, else the project id, else the
        application credential id, else "Unknown".
        :param cloud_specification: dict describing the cloud access (may contain the keys 'profile' and 'auth')
        """
        self.cloud_specification = cloud_specification  # contains sensitive information!
        # A specification without 'auth' falls through to "Unknown" instead of raising a KeyError
        auth = self.cloud_specification.get("auth") or {}
        self.cloud_specification["identifier"] = (self.cloud_specification.get('profile')
                                                  or auth.get('project_id')
                                                  or auth.get('application_credential_id')
                                                  or "Unknown")

    def create_application_credential(self, name=None):
        """
        Creates an application credential with name name
        :param name: Name of new application credential
        :return: the application credential dictionary
        """

    def delete_application_credential_by_id_or_name(self, ac_id_or_name):
        """
        Deletes existing application credential by id or name and returns true.
        If application credential not found it returns false.
        :param ac_id_or_name: application credential id or name
        :return: True if deleted else false
        """

    def get_image_by_id_or_name(self, image_id_or_name):
        """
        Returns image that has id or name image_id_or_name
        :param image_id_or_name: identifier
        :return: said image (dict) or none if not found
        """

    def get_flavor(self, instance_type):
        """
        Returns flavor that has id or name instance_type
        :param instance_type: identifier
        :return: said flavor (dict) or none if not found
        """

    def get_volume_snapshot_by_id_or_name(self, snapshot_id_or_name):
        """
        Returns snapshot that has id or name snapshot_id_or_name
        :param snapshot_id_or_name: identifier
        :return: said snapshot (dict) or none if not found
        """

    def get_network_by_id_or_name(self, network_id_or_name):
        """
        Returns network that has id or name network_id_or_name
        :param network_id_or_name: identifier
        :return: said network (dict) or none if not found
        """

    def get_subnet_by_id_or_name(self, subnet_id_or_name):
        """
        Returns subnet that has id or name subnet_id_or_name
        :param subnet_id_or_name: identifier
        :return: said subnet (dict) or none if not found
        """

    def list_servers(self):
        """
        Returns a list of all servers on logged in provider
        :return: said list of servers or empty list if none found
        """

    def create_server(self, name, flavor, image, network, key_name=None, wait=True,
                      volumes=None):  # pylint: disable=too-many-arguments
        """
        Creates a new server and waits for it to be accessible if wait=True. If volumes are given, they are attached.
        Returns said server (dict)
        :param name: name (str)
        :param flavor: flavor/type (str)
        :param image: image/bootable-medium (str)
        :param network: network (str)
        :param key_name: (str)
        :param wait: (bool)
        :param volumes: List of volumes (list (str))
        :return: server (dict)
        """

    def delete_server(self, name_or_id, delete_ips=True):
        """
        Deletes server and floating_ip as well if delete_ips is true. The resource is then free again
        :param name_or_id: name or id of the server to delete
        :param delete_ips: whether the floating ips of the server are deleted as well
        :return: True if delete succeeded, False otherwise
        """

    def delete_keypair(self, key_name):
        """
        Deletes keypair with key_name
        :param key_name: (str)
        :return: True if delete succeeded, False otherwise
        """

    def get_server_group_by_id_or_name(self, server_group_id_or_name):
        """
        Returns server_group that has id or name server_group_id_or_name
        :param server_group_id_or_name: identifier
        :return: said server_group (dict) or none if not found
        """

    def close(self):
        """
        Closes connection
        :return:
        """

    def create_keypair(self, name, public_key):
        """
        Creates a new keypair with name name and public_key public_key
        :param name: name of new keypair
        :param public_key: public_key of new keypair
        :return:
        """

    def get_network_id_by_subnet(self, subnet):
        """
        Gets network_id by subnet
        :param subnet: id (str)
        :return: (str)
        """

    def get_subnet_ids_by_network(self, network):
        """
        Gets subnet_ids (list (str)) by network_id
        :param network: id (str)
        :return: subnet_ids (list (str))
        """

    def get_free_resources(self):
        """
        Gets free resources. If a resource cannot be determined, assume maximum is free.
        :return: Dictionary containing the free resources
        """

    def get_volume_by_id_or_name(self, name_or_id):
        """
        Returns volume that has id or name name_or_id
        :param name_or_id: identifier
        :return: said volume (dict) or none if not found
        """

    def create_volume_from_snapshot(self, snapshot_name_or_id):
        """
        Creates a volume from snapshot.
        :param snapshot_name_or_id: name or id of snapshot
        :return: id of created volume or none if failed
        """

    def get_external_network(self, network_name_or_id):
        """
        Finds router interface with network id equal to given network and by that the external network.
        :param network_name_or_id: Name or id of network
        :return: Corresponding external network
        """

    def add_auto_ip(self, server, wait=False, timeout=60, reuse=True):
        """
        Add a floating IP to a server.
        Will reuse floating ips or create a new one if no floating-ip is down.
        :param server: the server that said floating ip will be attached to
        :param wait: wait for floating-ip to be assigned
        :param timeout: when to accept failing
        :param reuse: if False will just create a new floating-ip and not reuse an existing down one
        :return: the floating-ip
        """

    def attach_available_floating_ip(self, network=None, server=None):
        """
        Get a floating IP from a network or a pool and attach it to the server
        :param network: network or pool the floating ip is taken from
        :param server: server the floating ip is attached to
        :return:
        """

    def get_images(self):
        """
        Get a generator able to generate all images
        :return: A generator able to generate all images
        """

    def get_flavors(self):
        """
        Get a generator able to generate all flavors
        :return: A generator able to generate all flavors
        """

    def get_active_images(self):
        """
        Lists the names of all images returned by get_images whose status is "active" (case-insensitive).
        :return: list of image names (list (str))
        """
        return [image["name"] for image in self.get_images() if image["status"].lower() == "active"]

    def get_active_flavors(self):
        """
        Lists the names of all flavors returned by get_flavors whose name contains neither "legacy" nor
        "deprecated" (case-insensitive).
        :return: list of flavor names (list (str))
        """
        return [flavor["name"] for flavor in self.get_flavors()
                if "legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()]
diff --git a/bibigrid2/core/startup.py b/bibigrid2/core/startup.py
new file mode 100755
index 000000000..7973d2ca5
--- /dev/null
+++ b/bibigrid2/core/startup.py
@@ -0,0 +1,139 @@
+Contains main method. Interprets command line, sets logging and starts corresponding action.
+import logging
+import math
+import os
+import sys
+import time
+import traceback
+import yaml
+from bibigrid2.core.actions import check, create, ide, list_clusters, terminate_cluster, update, version
+from bibigrid2.core.utility import command_line_interpreter
+from bibigrid2.core.utility.handler import configuration_handler, provider_handler
# Handlers passed to logging.basicConfig in set_logger. StreamHandler() without arguments writes to stderr;
# the FileHandler is created (and bibigrid2.log opened) as soon as this module is imported.
LOGGING_HANDLER_LIST = [logging.StreamHandler(), logging.FileHandler("bibigrid2.log")]  # console (stderr) and file
# Log levels indexed by the number of -v flags given on the command line (see set_logger).
VERBOSITY_LIST = [logging.WARNING, logging.INFO, logging.DEBUG]
LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
LOG = logging.getLogger("bibigrid")
def get_cluster_id_from_mem():
    """
    Reads the cluster_id of the last created cluster and returns it. Used if no cluster_id is given.
    @return: cluster_id. If no mem file can be found, the file is not a valid yaml file, is empty, does not hold
    a mapping or doesn't contain a cluster_id, it returns none.
    """
    if not os.path.isfile(create.CLUSTER_MEMORY_PATH):
        return None
    try:
        with open(create.CLUSTER_MEMORY_PATH, mode="r", encoding="UTF-8") as cluster_memory_file:
            mem_dict = yaml.safe_load(stream=cluster_memory_file)
    except yaml.YAMLError as exc:
        LOG.warning("Couldn't read configuration %s: %s", create.CLUSTER_MEMORY_PATH, exc)
        return None
    # safe_load yields None for an empty file and may yield a list or scalar; only a mapping can hold a cluster_id
    if not isinstance(mem_dict, dict):
        return None
    return mem_dict.get("cluster_id")
def set_logger(verbosity):
    """
    Sets verbosity, format and handler.
    The format and handlers are applied through logging.basicConfig, the level is set on the "bibigrid" logger.
    :param verbosity: level of verbosity (index into VERBOSITY_LIST); values outside the list are clamped
    :return:
    """
    # clamp on both ends: a negative value would otherwise index VERBOSITY_LIST from the end
    capped_verbosity = max(0, min(verbosity, len(VERBOSITY_LIST) - 1))
    logging.basicConfig(format=LOGGER_FORMAT, handlers=LOGGING_HANDLER_LIST)
    log = logging.getLogger("bibigrid")
    log.setLevel(VERBOSITY_LIST[capped_verbosity])
    log.debug("Logging verbosity set to %s", capped_verbosity)
def _run_cluster_action(args, providers, configurations):
    """
    Runs the action that needs an existing cluster (terminate_cluster, ide or update).
    If no cluster id was given, the id of the last created cluster is used (and stored in args.cluster_id).
    :param args: command line arguments
    :param providers: list of providers
    :param configurations: list of configurations (dicts)
    :return: exit state of the executed action, 0 if no action was executed
    """
    if not args.cluster_id:
        args.cluster_id = get_cluster_id_from_mem()
        LOG.info("No cid (cluster_id) specified. Defaulting to last created cluster: %s",
                 args.cluster_id or 'None found')
    if not args.cluster_id:
        LOG.warning("Please make use of -cid .")
        return 0
    if args.terminate_cluster:
        LOG.info("Action terminate_cluster selected")
        return terminate_cluster.terminate_cluster(args.cluster_id, providers, args.debug)
    if args.ide:
        LOG.info("Action ide selected")
        return ide.ide(args.cluster_id, providers[0], configurations[0])
    if args.update:
        LOG.info("Action update selected")
        return update.update(args.cluster_id, providers[0], configurations[0])
    return 0


def _close_providers(providers, debug):
    """
    Closes the connection of every provider; a failing close does not prevent the remaining ones.
    :param providers: list of providers (may be empty or None)
    :param debug: if True the traceback of a failed close is printed, otherwise the error is logged
    :return: True if every provider was closed without an error, False otherwise
    """
    all_closed = True
    for provider in providers or []:
        try:
            provider.close()
        except Exception as err:  # pylint: disable=broad-except
            if debug:
                traceback.print_exc()
            else:
                LOG.error(err)
            all_closed = False
    return all_closed


def run_action(args, configurations, config_path):  # pylint: disable=too-many-nested-blocks,too-many-branches
    """
    Uses args to decide which action will be executed and executes said action.
    Prints the elapsed time afterwards (not for the version action).
    :param args: command line arguments
    :param configurations: list of configurations (dicts)
    :param config_path: path to configurations-file
    :return: exit state: the state returned by the action, 1 if no providers could be set up,
    2 if an exception occurred
    """
    if args.version:
        LOG.info("Action version selected")
        print(version.__version__)
        return 0

    start_time = time.time()
    exit_state = 0
    providers = []
    try:
        providers = provider_handler.get_providers(configurations)
        if not providers:
            exit_state = 1
        elif args.list_clusters:
            LOG.info("Action list_clusters selected")
            exit_state = list_clusters.print_list_clusters(args.cluster_id, providers)
        elif args.check:
            LOG.info("Action check selected")
            exit_state = check.check(configurations, providers)
        elif args.create:
            LOG.info("Action create selected")
            creator = create.Create(providers=providers,
                                    configurations=configurations,
                                    debug=args.debug,
                                    config_path=config_path)
            print("Creating a new cluster takes about 10 or more minutes depending on your cloud provider "
                  "and your configuration. Be patient.")
            exit_state = creator.create()
        else:
            exit_state = _run_cluster_action(args, providers, configurations)
    except Exception as err:  # pylint: disable=broad-except
        if args.debug:
            traceback.print_exc()
        else:
            LOG.error(err)
        exit_state = 2
    finally:
        # Release the provider connections even if the action failed; they used to leak on exceptions.
        if not _close_providers(providers, args.debug):
            exit_state = 2
    time_in_s = time.time() - start_time
    print(f"--- {math.floor(time_in_s / 60)} minutes and {time_in_s % 60} seconds ---")
    return exit_state
def main():
    """
    Entry point: parses the command line, configures logging, loads the configuration file and runs the
    selected action. Always ends the process: with the action's exit state, or with 1 if no configuration
    could be read.
    :return:
    """
    args = command_line_interpreter.interpret_command_line()
    set_logger(args.verbose)
    configurations = configuration_handler.read_configuration(args.config_input)
    if not configurations:
        sys.exit(1)
    exit_state = run_action(args, configurations, args.config_input)
    sys.exit(exit_state)


if __name__ == "__main__":
    main()
diff --git a/bibigrid2/core/utility/ansible_commands.py b/bibigrid2/core/utility/ansible_commands.py
new file mode 100644
index 000000000..c84030d87
--- /dev/null
+++ b/bibigrid2/core/utility/ansible_commands.py
@@ -0,0 +1,58 @@
+Module containing a bunch of useful commands to be used by sshHandler.py for cluster setup
+import os
+import bibigrid2.core.utility.paths.ansible_resources_path as aRP
+#TO_LOG = "| sudo tee -a /var/log/ansible.log"
+#AIY = "apt-get -y install"
+#SAU = "sudo apt-get update"
# (command, description) tuple: sets APT::Periodic::Unattended-Upgrade from "1" to "0" in
# /etc/apt/apt.conf.d/20auto-upgrades using sed.
NO_UPDATE = ("""sudo sed -i 's/APT::Periodic::Unattended-Upgrade "1";/APT::Periodic::Unattended-Upgrade "0";/g' """
             """/etc/apt/apt.conf.d/20auto-upgrades""", "Disable apt auto update.")
+# Setup (Python for everyone)
+# UPDATE = f"sudo {AU} {TO_LOG}"
+# PIP = f"sudo pip3 install --upgrade pip {TO_LOG}"
+# SETUPTOOLS = "sudo pip3 install setuptools"
+# LOG = "export ANSIBLE_LOG_PATH=~/ansible.log"
# (command, description) tuple: polls every 10 seconds until no process holds /var/lib/dpkg/lock anymore.
# lsof's error output is discarded via /dev/null ("2> null" would create a file named "null" instead).
WAIT_READY = ('while sudo lsof /var/lib/dpkg/lock 2> /dev/null; do '
              'echo "/var/lib/dpkg/lock locked - wait for 10 seconds"; '
              'sleep 10; done', "Wait for dpkg lock removed.")
+# SLEEP_10 = "sleep 10s"
+# RANDOM = "sudo DEBIAN_FRONTEND=noninteractive apt-get --yes install apt-transport-https ca-certificates " \
+# "software-properties-common python3 python3-pip libffi-dev libssl-dev"
+# PYTHON_WORKERS = f'ansible workers -i "{aRP.HOSTS_CONFIG_FILE_REMOTE}" --become -m raw -a "{SAU} && {AIY} python3' \
+# f'"'
+# Test Ansible
+# PING = (f'ansible -i "{aRP.HOSTS_CONFIG_FILE_REMOTE}" all -m ping',"Ping all hosts using ansible.")
+# OK = ('if [ $? -eq 0 ]; then echo "Ansible configuration seems to work properly."; '
+# 'else echo"Ansible hosts not reachable. There seems to be a misconfiguration."; fi',"Check for ")
+# Run ansible-galaxy to install ansible-galaxy roles from galaxy, git or url (.tar.gz)
+# GALAXY = f"ansible-galaxy install --roles-path {aRP.ADDITIONAL_ROLES_ROOT_PATH_REMOTE} -r {aRP.REQUIREMENTS_YML}"
+# Extract ansible roles from files (.tar.gz, .tgz)
+# EXTRACT = f"for f in $(find /tmp/roles -type f -regex '.*\\.t\\(ar\\.\\)?gz'); " \
+# f"do tar -xzf $f -C {aRP.ADDITIONAL_ROLES_ROOT_PATH_REMOTE}; done"
+# Fix line endings for all text based ansible file to ensure windows files being used correctly
+# GET_ASCII_FILES = "files=$(for f in $( find ~/playbook -type f); do file ${f} | grep ASCII | cut -f 1 -d ':'; done;)"
+# REPLACE_ENDINGS = "for file in ${file}; do sed -i 's/\\r$//' \"${file}\"; done"
+# Utility
ADD_PLAYBOOK_TO_LINUX_HOME = ("ln -s /opt/playbook ~/playbook", "Link /opt/playbook to ~/playbook.")

# Execute
PLAYBOOK_HOME = ("sudo mkdir -p /opt/playbook", "Create playbook home.")
PLAYBOOK_HOME_RIGHTS = ("sudo chown ubuntu:ubuntu /opt/playbook", "Adjust playbook home permission.")
# NOTE(review): the assignment target of this tuple is missing in the visible source (only the continuation
# line is present). The name MV_ANSIBLE_CONFIG is an assumption - confirm against the modules using it.
MV_ANSIBLE_CONFIG = (
    "sudo install -D /opt/playbook/ansible.cfg /etc/ansible/ansible.cfg", "Move ansible configuration.")
# Runs the site playbook with the generated inventory, limited to the master group
EXECUTE = (f"ansible-playbook {os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.SITE_YML)} -i "
           f"{os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.ANSIBLE_HOSTS)} -l master",
           "Execute ansible playbook. Be patient.")
+# ansible setup
# (command, description) tuples used to install ansible on a fresh machine
UPDATE = ("sudo apt-get update", "Update apt repository lists.")
# parenthesised like every other command tuple in this module
PYTHON3_PIP = ("sudo apt-get install -y python3-pip", "Install python3 pip using apt.")
ANSIBLE_PASSLIB = ("sudo pip install ansible==6.6 passlib", "Install Ansible and Passlib using pip.")
diff --git a/bibigrid2/core/utility/ansible_configurator.py b/bibigrid2/core/utility/ansible_configurator.py
new file mode 100644
index 000000000..0bce57c1e
--- /dev/null
+++ b/bibigrid2/core/utility/ansible_configurator.py
@@ -0,0 +1,305 @@
+Prepares ansible files (vars, common_configuration, ...)
+import logging
+import mergedeep
+import yaml
+from bibigrid2.core.actions import create
+from bibigrid2.core.actions import ide
+from bibigrid2.core.actions import list_clusters
+from bibigrid2.core.utility.handler import configuration_handler
+from bibigrid2.core.utility import id_generation
+from bibigrid2.core.utility.paths import ansible_resources_path as aRP
+from bibigrid2.core.utility import yaml_dumper
# Shares that are always added to the configured nfsShares when nfs is enabled
DEFAULT_NFS_SHARES = ["/vol/spool"]
# Prefix put in front of the name of every custom role in the site file
ADDITIONAL_PATH = "additional/"
PYTHON_INTERPRETER = "/usr/bin/python3"
# Roles of the master and worker play of the site file
# NOTE(review): generate_site_file_yaml also uses VARS_FILES, which is not defined in the visible source.
MASTER_ROLES = [{"role": "bibigrid", "tags": ["bibigrid", "bibigrid-master"]}]
WORKER_ROLES = [{"role": "bibigrid", "tags": ["bibigrid", "bibigrid-worker"]}]
# Defaults that are merged with the user's ideConf / zabbixConf / slurmConf in
# generate_common_configuration_yaml; values given by the user replace these.
IDE_CONF = {"ide": False, "workspace": ide.DEFAULT_IDE_WORKSPACE, "port_start": ide.REMOTE_BIND_ADDRESS,
            "port_end": ide.DEFAULT_IDE_PORT_END, "build": False}
ZABBIX_CONF = {"db": "zabbix", "db_user": "zabbix", "db_password": "zabbix", "timezone": "Europe/Berlin",
               "server_name": "bibigrid", "admin_password": "bibigrid"}
# The munge key is generated once, when this module is imported
SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme",
              "munge_key": id_generation.generate_munge_key(),
              "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}}
LOG = logging.getLogger("bibigrid")
def generate_site_file_yaml(custom_roles):
    """
    Generates site_yaml (list of plays: one for the master, one for the workers).
    Works on deep copies of VARS_FILES, MASTER_ROLES and WORKER_ROLES, so the module level defaults are not
    modified and repeated calls return the same result. Each play gets its own lists in case roles might
    differ between servers in the future.
    :param custom_roles: ansibleRoles given by the config. Each needs the key "name"; "vars_file" is optional
    :return: site_yaml (list of dicts)
    """
    import copy  # pylint: disable=import-outside-toplevel

    vars_files = copy.deepcopy(VARS_FILES)
    master_roles = copy.deepcopy(MASTER_ROLES)
    worker_roles = copy.deepcopy(WORKER_ROLES)
    # add custom roles and vars
    for custom_role in custom_roles or []:
        if custom_role.get("vars_file"):
            vars_files.append(custom_role["vars_file"])
        role_path = ADDITIONAL_PATH + custom_role["name"]
        master_roles.append(role_path)
        worker_roles.append(role_path)
    return [{'hosts': 'master', "become": "yes",
             "vars_files": vars_files, "roles": master_roles},
            {"hosts": "workers", "become": "yes", "vars_files": copy.deepcopy(vars_files),
             "roles": worker_roles}]
def generate_instances_yaml(cluster_dict, configuration, provider, cluster_id):  # pylint: disable=too-many-locals
    """
    Builds the instances information (master and worker groups) reduced to the keys that are passed on.
    ToDo filter what information really is necessary. Determined by further development
    :param cluster_dict: cluster_dict to get the master information from
    :param configuration: configuration of master cloud ToDo needs to be list in the future
    :param provider: provider of master cloud ToDo needs to be list in the future
    :param cluster_id: To get proper naming
    :return: filtered information (dict) with the keys "master" and "workers"
    """
    LOG.info("Generating instances file...")
    flavor_keys = ["name", "ram", "vcpus", "disk", "ephemeral"]

    def reduce_flavor(full_flavor):
        # keep only the flavor entries listed in flavor_keys
        return {flavor_key: full_flavor[flavor_key] for flavor_key in flavor_keys}

    worker_groups = []
    for group_index, worker_group in enumerate(configuration.get("workerInstances", [])):
        group_flavor = reduce_flavor(provider.get_flavor(worker_group["type"]))
        group_image = worker_group["image"]
        group_network = configuration["network"]
        # name covers the whole index range of the group, regexp matches any single worker of it
        index_range = "[0-{}]".format(worker_group.get('count', 1) - 1)
        group_name = create.WORKER_IDENTIFIER(worker_group=group_index, cluster_id=cluster_id,
                                              additional=index_range)
        group_regexp = create.WORKER_IDENTIFIER(worker_group=group_index, cluster_id=cluster_id,
                                                additional=r"\d+")
        worker_groups.append({"name": group_name, "regexp": group_regexp, "image": group_image,
                              "network": group_network, "flavor": group_flavor})

    cluster_master = cluster_dict["master"]
    master = {}
    for master_key in ["name", "private_v4", "public_v4", "public_v6", "cloud_specification"]:
        master[master_key] = cluster_master[master_key]
    master["flavor"] = reduce_flavor(cluster_master["flavor"])
    return {"master": master, "workers": worker_groups}
def pass_through(dict_from, dict_to, key_from, key_to=None):
    """
    Copies the value stored under key_from in dict_from to dict_to (in place), provided that value is truthy.
    Missing keys and falsy values (None, False, 0, empty containers) are not copied.
    @param dict_from: dict to read from
    @param dict_to: dict to write to
    @param key_from: key to look up in dict_from
    @param key_to: key to store the value under in dict_to; defaults to key_from
    @return:
    """
    target_key = key_to or key_from
    value = dict_from.get(key_from)
    if value:
        dict_to[target_key] = value
def generate_common_configuration_yaml(cidrs, configuration, cluster_id, ssh_user, default_user):
    """
    Generates common_configuration yaml (dict)
    The user's slurmConf, ideConf and zabbixConf are merged over the module defaults; the nfs mounts are only
    added if nfs is enabled.
    :param cidrs: str subnet cidrs (provider generated)
    :param configuration: master configuration (first in file)
    :param cluster_id: Id of cluster
    :param ssh_user: user for ssh connections
    :param default_user: Given default user
    :return: common_configuration_yaml (dict)
    """
    LOG.info("Generating common configuration file...")
    # NOTE(review): "enable_slurm" defaults to False while "slurm" defaults to True for the same configuration
    # key - kept as is, confirm which default the playbook expects.
    common_configuration_yaml = {"cluster_id": cluster_id, "cluster_cidrs": cidrs,
                                 "default_user": default_user,
                                 "local_fs": configuration.get("localFS", False),
                                 "local_dns_lookup": configuration.get("localDNSlookup", False),
                                 "use_master_as_compute": configuration.get("useMasterAsCompute", True),
                                 "enable_slurm": configuration.get("slurm", False),
                                 "enable_zabbix": configuration.get("zabbix", False),
                                 "enable_nfs": configuration.get("nfs", False),
                                 "enable_ide": configuration.get("ide", False),
                                 "slurm": configuration.get("slurm", True), "ssh_user": ssh_user,
                                 "slurm_conf": mergedeep.merge({}, SLURM_CONF, configuration.get("slurmConf", {}),
                                                               strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
                                 }
    if configuration.get("nfs"):
        nfs_shares = (configuration.get("nfsShares") or []) + DEFAULT_NFS_SHARES
        nfs_mounts = []
        for nfs_share in nfs_shares:
            # exactly one leading slash, whether or not the share was given with one ("//vol/spool" before)
            absolute_share = "/" + nfs_share.lstrip("/")
            nfs_mounts.append({"src": absolute_share, "dst": absolute_share})
        common_configuration_yaml["nfs_mounts"] = nfs_mounts
        common_configuration_yaml["ext_nfs_mounts"] = [{"src": ext_nfs_share, "dst": ext_nfs_share} for
                                                       ext_nfs_share in (configuration.get("extNfsShares") or [])]

    if configuration.get("ide"):
        common_configuration_yaml["ide_conf"] = mergedeep.merge({}, IDE_CONF, configuration.get("ideConf", {}),
                                                                strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
    if configuration.get("zabbix"):
        common_configuration_yaml["zabbix_conf"] = mergedeep.merge({}, ZABBIX_CONF, configuration.get("zabbixConf", {}),
                                                                   strategy=mergedeep.Strategy.TYPESAFE_REPLACE)

    # optional keys are only passed on if they are set
    for from_key, to_key in [("waitForServices", "wait_for_services"), ("ansibleRoles", "ansible_roles"),
                             ("ansibleGalaxyRoles", "ansible_galaxy_roles")]:
        pass_through(configuration, common_configuration_yaml, from_key, to_key)
    return common_configuration_yaml
def generate_ansible_hosts_yaml(ssh_user, configuration, cluster_id):
    """
    Generates ansible_hosts_yaml (inventory file): the master as localhost and one entry per worker group.
    Worker groups whose type contains "ephemeral" are placed in the child group "ephemeral".
    :param ssh_user: str global SSH-username
    :param configuration: dict
    :param cluster_id: id of cluster
    :return: ansible_hosts yaml (dict)
    """
    LOG.info("Generating ansible hosts file...")
    regular_hosts = {}
    ephemeral_hosts = {}
    # vpnwkr are handled like workers on this level
    ansible_hosts_yaml = {"master": {"hosts": {"localhost": to_instance_host_dict(ssh_user)}},
                          "workers": {"hosts": regular_hosts,
                                      "children": {"ephemeral": {"hosts": ephemeral_hosts}}}}
    for group_index, worker_group in enumerate(configuration.get("workerInstances", [])):
        last_index = worker_group.get('count', 1) - 1
        group_name = create.WORKER_IDENTIFIER(worker_group=group_index, cluster_id=cluster_id,
                                              additional=f"[0:{last_index}]")
        host_entry = to_instance_host_dict(ssh_user, ip="", local=False)
        destination = ephemeral_hosts if "ephemeral" in worker_group["type"] else regular_hosts
        destination[group_name] = host_entry
    return ansible_hosts_yaml
def to_instance_host_dict(ssh_user, ip="localhost", local=True):  # pylint: disable=invalid-name
    """
    Generates a single host entry for the inventory.
    :param ssh_user: str global SSH-username
    :param ip: str ip; the key "ip" is left out if this is empty
    :param local: bool, True for a local connection, False for ssh
    :return: host entry (dict)
    """
    connection = "ssh"
    if local:
        connection = "local"
    host_entry = dict(ansible_connection=connection,
                      ansible_python_interpreter=PYTHON_INTERPRETER,
                      ansible_user=ssh_user)
    if not ip:
        return host_entry
    host_entry["ip"] = ip
    return host_entry
def get_cidrs(configurations, providers):
    """
    Collects the cidrs of all configured subnets, grouped by provider.
    :param configurations: list of configurations (dict); "subnet" holds one subnet (id or name) or a list of them
    :param providers: list of providers, in the same order as configurations
    :return: list with one dict per provider: {"provider": class name, "provider_cidrs": list of cidrs}
    """
    all_cidrs = []
    for provider, configuration in zip(providers, configurations):
        provider_name = type(provider).__name__
        subnet_ids_or_names = configuration["subnet"]
        if not isinstance(subnet_ids_or_names, list):
            # a single subnet is treated like a list with one entry
            subnet_ids_or_names = [subnet_ids_or_names]
        cidrs = [provider.get_subnet_by_id_or_name(subnet_id_or_name)["cidr"]
                 for subnet_id_or_name in subnet_ids_or_names]
        all_cidrs.append({"provider": provider_name, "provider_cidrs": cidrs})
    return all_cidrs
def get_ansible_roles(ansible_roles):
    """
    Filters ansible_roles: a role is kept if it has both "file" and "hosts" set, otherwise a warning is logged
    and the role is left out. Of a kept role only the set keys file, hosts, name, vars and vars_file are
    passed on.
    :param ansible_roles: ansible_roles from master configuration (first configuration); may be None
    :return: list of valid ansible_roles
    """
    passed_keys = ("file", "hosts", "name", "vars", "vars_file")
    ansible_roles_yaml = []
    for ansible_role in (ansible_roles or []):
        if not (ansible_role.get("file") and ansible_role.get("hosts")):
            # the message used to name galaxy, git and url, which this check never looks at
            LOG.warning("Ansible role %s is missing file or hosts. Not added.", ansible_role)
            continue
        ansible_roles_yaml.append({key: ansible_role[key] for key in passed_keys if ansible_role.get(key)})
    return ansible_roles_yaml
def get_ansible_galaxy_roles(ansible_galaxy_roles):
    """
    Filters ansible_galaxy_roles: a role is kept if it has a source (galaxy, git or url) and hosts set,
    otherwise a warning is logged and the role is left out. Of a kept role only the set keys hosts, name,
    galaxy, git, url, vars and vars_file are passed on.
    :param ansible_galaxy_roles: list of galaxy roles (dict) from the configuration; may be None
    :return: list of valid ansible_galaxy_roles
    """
    ansible_galaxy_roles_yaml = []
    for ansible_galaxy_role in (ansible_galaxy_roles or []):
        if not any(ansible_galaxy_role.get(source) for source in ("galaxy", "git", "url")):
            LOG.warning("Galaxy role %s had neither galaxy,git nor url. Not added.", ansible_galaxy_role)
            continue
        if not ansible_galaxy_role.get("hosts"):
            # skipped like in get_ansible_roles instead of failing with a KeyError
            LOG.warning("Galaxy role %s has no hosts. Not added.", ansible_galaxy_role)
            continue
        ansible_galaxy_role_dict = {"hosts": ansible_galaxy_role["hosts"]}
        for key in ["name", "galaxy", "git", "url", "vars", "vars_file"]:
            if ansible_galaxy_role.get(key):
                ansible_galaxy_role_dict[key] = ansible_galaxy_role[key]
        ansible_galaxy_roles_yaml.append(ansible_galaxy_role_dict)
    return ansible_galaxy_roles_yaml
def generate_worker_specification_file_yaml(configurations):
    """
    Generates worker_specification_file_yaml: one entry (TYPE, IMAGE, NETWORK) per worker group of every
    configuration.
    :param configurations: list of configurations (dict)
    :return: worker_specification_yaml (list of dicts)
    """
    LOG.info("Generating worker specification file...")
    groups_per_configuration = configuration_handler.get_list_by_key(configurations, "workerInstances", False)
    # create.prepare_configuration guarantees that key is set
    network_per_configuration = configuration_handler.get_list_by_key(configurations, "network", False)
    return [{"TYPE": worker_group["type"],
             "IMAGE": worker_group["image"],
             "NETWORK": network}
            for worker_groups, network in zip(groups_per_configuration, network_per_configuration)
            for worker_group in worker_groups]
def write_yaml(path, generated_yaml, alias=False):
    """
    Dumps generated_yaml to the file at path, replacing its previous content.
    @param path: path of the file to write
    @param generated_yaml: data to dump
    @param alias: if True the data is dumped with yaml.safe_dump, otherwise with the NoAliasSafeDumper
    @return:
    """
    LOG.debug("Writing yaml %s", path)
    with open(path, mode="w+", encoding="UTF-8") as file:
        if not alias:
            yaml.dump(data=generated_yaml, stream=file, Dumper=yaml_dumper.NoAliasSafeDumper)
            return
        yaml.safe_dump(data=generated_yaml, stream=file)
def configure_ansible_yaml(providers, configurations, cluster_id):
    """
    Generates all ansible-configuration-yaml files and writes them afterwards.
    The first configuration and the first provider are used as the master's.
    :param providers: list of providers
    :param configurations: list of configurations (dict)
    :param cluster_id: id of cluster to create
    :return:
    """
    LOG.info("Writing ansible files...")
    master_configuration = configurations[0]
    master_provider = providers[0]
    alias = master_configuration.get("aliasDumper", False)
    cluster_dict = list_clusters.dict_clusters(providers)[cluster_id]
    ansible_roles = get_ansible_roles(master_configuration.get("ansibleRoles"))
    default_user = master_provider.cloud_specification["auth"].get("username",
                                                                   master_configuration.get("sshUser", "Ubuntu"))

    # Everything is generated first; nothing is written before all files could be generated.
    generated_files = []
    generated_files.append((aRP.WORKER_SPECIFICATION_FILE,
                            generate_worker_specification_file_yaml(configurations)))
    generated_files.append((aRP.COMMONS_CONFIG_FILE,
                            generate_common_configuration_yaml(cidrs=get_cidrs(configurations, providers),
                                                               configuration=master_configuration,
                                                               cluster_id=cluster_id,
                                                               ssh_user=master_configuration["sshUser"],
                                                               default_user=default_user)))
    generated_files.append((aRP.COMMONS_INSTANCES_FILE,
                            generate_instances_yaml(cluster_dict, master_configuration,
                                                    master_provider, cluster_id)))
    generated_files.append((aRP.HOSTS_CONFIG_FILE,
                            generate_ansible_hosts_yaml(master_configuration["sshUser"], master_configuration,
                                                        cluster_id)))
    generated_files.append((aRP.SITE_CONFIG_FILE, generate_site_file_yaml(ansible_roles)))

    for path, generated_yaml in generated_files:
        write_yaml(path, generated_yaml, alias)
diff --git a/bibigrid2/core/utility/command_line_interpreter.py b/bibigrid2/core/utility/command_line_interpreter.py
new file mode 100644
index 000000000..b057bb82b
--- /dev/null
+++ b/bibigrid2/core/utility/command_line_interpreter.py
@@ -0,0 +1,44 @@
+Has necessary methods and variables to interpret the command line
+import argparse
+import os
+STANDARD_CONFIG_INPUT_PATH = os.path.expanduser("~/.config/bibigrid")
+FOLDER_START = ("~/", "/")
+def interpret_command_line():
+    """
+    Builds the argument parser and parses the command line with it. Used in startup.py
+    :return: the parsed arguments
+    """
+    def resolve_config_path(given_path):
+        # paths starting with one of FOLDER_START are taken as given,
+        # everything else is interpreted relative to STANDARD_CONFIG_INPUT_PATH
+        if given_path.startswith(FOLDER_START):
+            return given_path
+        return os.path.join(STANDARD_CONFIG_INPUT_PATH, given_path)
+    parser = argparse.ArgumentParser(description='Bibigrid2 sets up cluster easily inside a cloud environment')
+    parser.add_argument("-v", "--verbose", action="count", default=0,
+                        help="Increases logging verbosity. `-v` adds more info to the logfile, "
+                             "`-vv` adds debug information to the logfile.")
+    parser.add_argument("-d", "--debug", action='store_true',
+                        help="Keeps cluster active. Asks before shutdown. "
+                             "Offers termination after create")
+    parser.add_argument("-i", "--config_input", metavar="", required=True, type=resolve_config_path,
+                        help="Path to YAML configurations file. "
+                             "Relative paths can be used and start "
+                             "at ~/.config/bibigrid")
+    parser.add_argument("-cid", "--cluster_id", metavar="", type=str, default="",
+                        help="Cluster id is needed for ide and termination")
+    # exactly one of the following actions has to be chosen
+    action_flags = [
+        ("-V", "--version", "Displays version"),
+        ("-t", "--terminate_cluster", "Terminates cluster. Needs cluster-id set."),
+        ("-c", "--create", "Creates cluster"),
+        ("-l", "--list_clusters",
+         "Lists all running clusters. If cluster-id is set, will list this cluster in detail only"),
+        ("-ch", "--check", "Validates cluster configuration"),
+        ("-ide", "--ide", "Establishes a secured connection to ide. Needs cluster-id set"),
+        ("-u", "--update", "Updates master's playbook. "
+                           "Needs cluster-id set, no job running "
+                           "and no workers up")]
+    actions = parser.add_mutually_exclusive_group(required=True)
+    for short_flag, long_flag, help_text in action_flags:
+        actions.add_argument(short_flag, long_flag, action='store_true', help=help_text)
+    return parser.parse_args()
diff --git a/bibigrid2/core/utility/handler/cluster_ssh_handler.py b/bibigrid2/core/utility/handler/cluster_ssh_handler.py
new file mode 100644
index 000000000..78500ade0
+This module gets information about ssh connection.
+import logging
+import os
+from bibigrid2.core.actions import create, list_clusters
+LOG = logging.getLogger("bibigrid")
+def get_ssh_connection_info(cluster_id, master_provider, master_configuration):
+    """
+    Gets master_ip, ssh_user and private key to enable other modules to create an ssh connection to a clusters master
+    @param cluster_id: id of cluster to connect to. If it contains a ".", it is used as the master's ip instead
+    @param master_provider: master's provider
+    @param master_configuration: master's configuration
+    @return: triple (master_ip, ssh_user, private_key); private_key is None if no key file was found
+    """
+    # If cluster_id is an ip, cluster_id will be used for master_ip
+    if "." in cluster_id:
+        LOG.info("Interpreting %s as ip since it doesn't match cluster_id", cluster_id)
+        master_ip = cluster_id
+    else:
+        master_ip = list_clusters.get_master_access_ip(cluster_id, master_provider)
+    ssh_user = master_configuration.get("sshUser")
+    public_keys = master_configuration.get("sshPublicKeyFiles")
+    used_private_key = None
+    # first check configuration then if not found take the temporary key
+    if public_keys:
+        public_key = public_keys[0]
+        if isinstance(public_key, str):
+            # only the ".pub" suffix is removed; str.strip(".pub") would remove every leading and trailing
+            # '.', 'p', 'u' and 'b' character and thereby mangle names like "bub.pub" or ".ssh/..."
+            suffix = ".pub"
+            private_key = public_key[:-len(suffix)] if public_key.endswith(suffix) else public_key
+            if os.path.isfile(private_key):
+                used_private_key = private_key
+    if not used_private_key:
+        private_key = os.path.join(create.KEY_FOLDER, create.KEY_NAME.format(cluster_id=cluster_id))
+        if os.path.isfile(private_key):
+            used_private_key = private_key
+    return master_ip, ssh_user, used_private_key
+This module contains methods to read the configuration and cloud specification.
+import logging
+import os
+import mergedeep
+import yaml
+CLOUDS_YAML_PATHS = ["~/.config/bibigrid", "/etc/bibigrid", ""]
+CLOUDS_YAML = "clouds.yaml"
+CLOUDS_PUBLIC_YAML = "clouds-public.yaml"
+CLOUD_ROOT_KEY = "clouds"
+CLOUD_PUBLIC_ROOT_KEY = "public-clouds"
+LOG = logging.getLogger("bibigrid")
+def read_configuration(path="bibigrid.yml"):
+    """
+    Loads the yaml file at path.
+    :param path: Path to yaml file
+    :return: loaded yaml content; None if the file doesn't exist or isn't valid yaml
+    """
+    if not os.path.isfile(path):
+        LOG.warning("No such configuration file %s.", path)
+        return None
+    with open(path, mode="r", encoding="UTF-8") as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            LOG.warning("Couldn't read configuration %s: %s", path, exc)
+    return None
+def get_list_by_key(configurations, key, get_empty=True):
+    """
+    Collects the value stored under key from every configuration.
+    :param configurations: YAML of configuration File containing the configuration-data for each provider
+    :param key: Key that is looked out for
+    :param get_empty: if true, configurations without a (truthy) value for key contribute their falsy value
+    (None if key is missing); if false, they are skipped
+    :return: List of values of said key through all configs
+    """
+    values = []
+    for configuration in configurations:
+        if get_empty or configuration.get(key):
+            values.append(configuration.get(key))
+    return values
+# def get_dict_list_by_key_list(configurations, keys, get_empty=True):
+# return [{key: configuration.get(key) for key in keys if configuration.get(key) or get_empty}
+# for configuration in configurations]
+def find_file_in_folders(file_name, folders):
+    """
+    Looks for file_name in folders (in the given order) and loads the first match via read_configuration
+    @param file_name: name of the file to look for
+    @param folders: folders to search for file named file_name; "~" is expanded
+    @return: result of read_configuration for the first match or None if no folder contains the file
+    """
+    for folder_path in folders:
+        candidate = os.path.expanduser(os.path.join(folder_path, file_name))
+        if not os.path.isfile(candidate):
+            LOG.debug("File %s in folder %s not found.", file_name, folder_path)
+            continue
+        LOG.debug("File %s found in folder %s.", file_name, folder_path)
+        return read_configuration(candidate)
+    return None
+def get_clouds_files():
+    """
+    Wrapper to call find_file_in_folders with the right arguments to find the clouds.yaml and clouds-public.yaml
+    @return: tuple (clouds, clouds_public): content below the root key of clouds.yaml and clouds-public.yaml.
+    An entry is None if its file is not found or doesn't contain the root key.
+    """
+    clouds_yaml = find_file_in_folders(CLOUDS_YAML, CLOUDS_YAML_PATHS)
+    clouds_public_yaml = find_file_in_folders(CLOUDS_PUBLIC_YAML, CLOUDS_YAML_PATHS)
+    clouds = None
+    clouds_public = None
+    if clouds_yaml:
+        clouds = clouds_yaml.get(CLOUD_ROOT_KEY)
+        if not clouds:
+            LOG.warning("%s is not valid. Must contain key '%s:'", CLOUDS_YAML, CLOUD_ROOT_KEY)
+    else:
+        # the message needs three arguments: file name, searched folders and file name again
+        LOG.warning("No %s at %s! Please copy your %s to one of those listed folders. Aborting...",
+                    CLOUDS_YAML, CLOUDS_YAML_PATHS, CLOUDS_YAML)
+    # clouds-public.yaml is optional, therefore a missing file is not reported
+    if clouds_public_yaml:
+        clouds_public = clouds_public_yaml.get(CLOUD_PUBLIC_ROOT_KEY)
+        if not clouds_public:
+            LOG.warning("%s is not valid. Must contain key '%s'", CLOUDS_PUBLIC_YAML, CLOUD_PUBLIC_ROOT_KEY)
+    return clouds, clouds_public
+def get_cloud_specification(cloud_name, clouds, clouds_public):
+    """
+    Builds the full specification of cloud cloud_name.
+    As in openstack cloud_public_specification will be overwritten by cloud_private_specification
+    :param cloud_name: name of the cloud to look for in clouds.yaml
+    :param clouds: dict containing the data loaded from clouds.yaml
+    :param clouds_public: dict containing the data loaded from clouds-public.yaml
+    :return: merged specification if a valid profile is set, the plain clouds.yaml entry if not.
+    {} if cloud_name is unknown or a key has different types in both files.
+    """
+    cloud_private_specification = clouds.get(cloud_name)
+    if not cloud_private_specification:
+        LOG.warning("%s is not a valid cloud name. Must be contained under key '%s'", cloud_name, CLOUD_ROOT_KEY)
+        return {}
+    public_cloud_name = cloud_private_specification.get(CLOUDS_PUBLIC_NAME_KEY)
+    if not (public_cloud_name and clouds_public):
+        LOG.debug("Using only clouds.yaml since no clouds-public profile is set.")
+        return cloud_private_specification
+    LOG.debug("Trying to find profile...")
+    cloud_public_specification = clouds_public.get(public_cloud_name)
+    if not cloud_public_specification:
+        LOG.warning("%s is not a valid profile name. "
+                    "Must be contained under key '%s'", public_cloud_name, CLOUD_PUBLIC_ROOT_KEY)
+        return cloud_private_specification
+    LOG.debug("Profile found. Merging begins...")
+    # merge into a fresh dict so that the loaded clouds data is not modified
+    cloud_full_specification = {}
+    try:
+        # sources are applied from left to right: the profile first, so clouds.yaml values win
+        mergedeep.merge(cloud_full_specification, cloud_public_specification, cloud_private_specification,
+                        strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
+    except TypeError as exc:
+        LOG.warning("Existing %s and %s configuration keys don't match in type: %s",
+                    CLOUDS_YAML, CLOUDS_PUBLIC_YAML, exc)
+        return {}
+    return cloud_full_specification
+def get_cloud_specifications(configurations):
+    """
+    Calls get_cloud_specification for every configuration that names a cloud (key CLOUD_CONFIGURATION_KEY)
+    @param configurations: list of configurations (dict)
+    @return: list of dicts: cloud_specifications of those configurations. Empty if clouds.yaml couldn't be loaded.
+    """
+    clouds, clouds_public = get_clouds_files()
+    if not isinstance(clouds, dict):
+        return []
+    cloud_specifications = []
+    for configuration in configurations:
+        cloud = configuration.get(CLOUD_CONFIGURATION_KEY)
+        if not cloud:
+            continue
+        # the specification might be empty
+        cloud_specifications.append(get_cloud_specification(cloud, clouds, clouds_public))
+    return cloud_specifications
+This module holds methods to return the logfile's path.
+import logging
+LOG = logging.getLogger("bibigrid")
+def get_logging_path():
+    """
+    Returns the path where the logfile is stored
+    @return: baseFilename of the first root handler that has one; None if there is no such handler
+    """
+    root_handlers = LOG.getLoggerClass().root.handlers
+    return next((handler.baseFilename for handler in root_handlers if hasattr(handler, 'baseFilename')), None)
+This module contains different selectors to pick and create a connection to the right provider.
+import logging
+from bibigrid2.core.utility.handler import configuration_handler
+from bibigrid2.openstack import openstack_provider
+PROVIDER_NAME_DICT = {"openstack": openstack_provider.OpenstackProvider}
+PROVIDER_CLASS_DICT = {provider.__name__: provider for provider in PROVIDER_NAME_DICT.values()}
+LOG = logging.getLogger("bibigrid")
+def get_provider_by_class_name(provider_name, provider_dict=PROVIDER_CLASS_DICT):  # pylint: disable=dangerous-default-value
+    """
+    Looks up the provider stored under provider_name in provider_dict.
+    A KeyError is raised if there is no such key.
+    :param provider_name: key of provider_dict
+    :param provider_dict: dict to look in; defaults to PROVIDER_CLASS_DICT
+    :return: provider
+    """
+    provider = provider_dict[provider_name]
+    return provider
+def get_provider_by_name(provider_name, provider_dict=PROVIDER_NAME_DICT):  # pylint: disable=dangerous-default-value
+    """
+    Looks up the provider stored under provider_name in provider_dict.
+    Unlike get_provider_by_class_name no KeyError is raised for unknown keys.
+    :param provider_name: key of provider_dict
+    :param provider_dict: dict to look in; defaults to PROVIDER_NAME_DICT
+    :return: provider or None if provider_name is not a key of provider_dict
+    """
+    provider = provider_dict.get(provider_name)
+    return provider
+def get_provider_list_by_name_list(provider_name_list, cloud_specifications):
+    """
+    Creates one provider per (provider_name, cloud_specification) pair.
+    If name is not found in PROVIDER_NAME_DICT, PROVIDER_CLASS_DICT is tried instead.
+    If not found in both a key error is thrown.
+    :param provider_name_list: list of provider names
+    :param cloud_specifications: list of cloud specifications
+    :return: list of providers
+    """
+    provider_list = []
+    for provider_name, cloud_specification in zip(provider_name_list, cloud_specifications):
+        provider_class = get_provider_by_name(provider_name) or get_provider_by_class_name(provider_name)
+        provider_list.append(provider_class(cloud_specification))
+    return provider_list
+def get_providers(configurations):
+    """
+    Reads list of provider_names (key "infrastructure") from configurations.
+    Determines list of providers by provider_names and returns it.
+    If a provider name is unknown the resulting key error is logged as a warning.
+    :param configurations: list of configurations (dict)
+    :return: list of providers; None if no cloud specification is available or a provider name is unknown
+    """
+    cloud_specifications = configuration_handler.get_cloud_specifications(configurations)
+    if not cloud_specifications:
+        return None
+    try:
+        provider_names = configuration_handler.get_list_by_key(configurations, "infrastructure")
+        return get_provider_list_by_name_list(provider_names, cloud_specifications)
+    except KeyError as exc:
+        LOG.warning("Check infrastructure in configurations! Key: %s", str(exc))
+        return None
+This module handles ssh and sftp connections to master and vpnwkrs. It also holds general execution routines used to
+set up the cluster.
+import logging
+import os
+import time
+import socket
+import paramiko
+import yaml
+from bibigrid2.models.exceptions import ConnectionException, ExecutionException
+from bibigrid2.core.utility import ansible_commands as aC
+PRIVATE_KEY_FILE = ".ssh/id_ecdsa" # to name bibigrid-temp keys identically on remote
+ (f"chmod 600 {PRIVATE_KEY_FILE}","Adjust private key permissions."),
+# ANSIBLE_START = [aC.WAIT_READY, aC.UPDATE, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE] # another UPDATE seems not to be necessary.
+VPN_SETUP = ["echo Example"]
+LOG = logging.getLogger("bibigrid")
+def get_ac_command(master_provider, name):
+    """
+    Builds the command that writes a clouds.yaml holding application credentials for cloud "master" to remote.
+    Application credentials already present in the master's auth data are reused,
+    otherwise a new application credential is created via master_provider.
+    @param master_provider: provider that holds the master
+    @param name: how the application credential shall be called
+    @return: tuple (command to execute on remote to create application credential, description for logging)
+    """
+    cloud_specification = master_provider.cloud_specification
+    auth = cloud_specification["auth"]
+    if auth.get("application_credential_id") and auth.get("application_credential_secret"):
+        wanted_keys = ["auth", "region_name", "interface", "identity_api_version", "auth_type"]
+        ac_cloud_specification = {key: cloud_specification[key] for key in wanted_keys
+                                  if key in cloud_specification}
+    else:
+        wanted_keys = ["region_name", "interface", "identity_api_version"]
+        application_credential = master_provider.create_application_credential(name=name)
+        credential_entries = {"application_credential_id": application_credential["id"],
+                              "application_credential_secret": application_credential["secret"],
+                              "auth_type": "v3applicationcredential", "auth_url": auth["auth_url"]}
+        ac_cloud_specification = {key: cloud_specification[key] for key in wanted_keys
+                                  if key in cloud_specification}
+        ac_cloud_specification.update(credential_entries)
+    ac_clouds_yaml = {"clouds": {"master": ac_cloud_specification}}
+    command = f"echo '{yaml.safe_dump(ac_clouds_yaml)}' | sudo install -D /dev/stdin /etc/openstack/clouds.yaml"
+    return (command, "Copy application credentials.")
+def get_add_ssh_public_key_commands(ssh_public_key_files):
+    """
+    Creates one command per public key file that appends the file's first line to .ssh/authorized_keys on remote.
+    :param ssh_public_key_files: public keys to add (paths); may be None or empty
+    :return: list of public key add commands, each a tuple (command, description)
+    """
+    if not ssh_public_key_files:
+        return []
+    commands = []
+    for key_path in ssh_public_key_files:
+        with open(key_path, mode="r", encoding="UTF-8") as key_file:
+            key_line = key_file.readline().strip()
+        commands.append((f"echo {key_line} >> .ssh/authorized_keys", f"Add SSH Key {key_path}."))
+    return commands
+def copy_to_server(sftp, localpath, remotepath):
+    """
+    Recursively copies files and folders to server.
+    If a folder is given as localpath, the structure within will be kept.
+    :param sftp: sftp connection
+    :param localpath: file or folder locally
+    :param remotepath: file or folder on the server
+    :return:
+    """
+    LOG.debug("Copy %s to %s...", localpath, remotepath)
+    if os.path.isfile(localpath):
+        sftp.put(localpath, remotepath)
+        return
+    try:
+        sftp.mkdir(remotepath)
+    except OSError:
+        # errors of mkdir are ignored (e.g. the folder may already exist)
+        pass
+    for entry in os.listdir(localpath):
+        copy_to_server(sftp, "/".join((localpath, entry)), "/".join((remotepath, entry)))
+def is_active(client, floating_ip_address, private_key, username, timeout=5):
+    """
+    Checks if connection is possible and therefore if server is active.
+    Raises ConnectionException if no connection could be established.
+    :param client: created client
+    :param floating_ip_address: ip to connect to
+    :param private_key: SSH-private_key
+    :param username: SSH-username
+    :param timeout: number of retries before accepting failure
+    (after a NoValidConnectionsError the waiting time doubles with every retry: 2**attempts seconds)
+    """
+    attempts = 0
+    establishing_connection = True
+    while establishing_connection:
+        try:
+            # connect and auth timeout are fixed to 5 seconds and independent of parameter timeout
+            client.connect(hostname=floating_ip_address, username=username, pkey=private_key, timeout=5, auth_timeout=5)
+            establishing_connection = False
+        except paramiko.ssh_exception.NoValidConnectionsError as exc:
+            LOG.info(f"Attempting to connect to {floating_ip_address}... This might take a while", )
+            if attempts < timeout:
+                time.sleep(2 ** attempts)
+                attempts += 1
+            else:
+                LOG.error(f"Attempt to connect to {floating_ip_address} failed.")
+                raise ConnectionException(exc) from exc
+        except socket.timeout as exc:
+            # retried immediately; the connect call itself already waited for its timeout
+            LOG.warning("Socket timeout exception occurred. Try again ...")
+            if attempts < timeout:
+                attempts += 1
+            else:
+                LOG.error(f"Attempt to connect to {floating_ip_address} failed, due to a socket timeout.")
+                raise ConnectionException(exc) from exc
+        # NOTE(review): socket.timeout is an alias of TimeoutError since Python 3.10, which would make this
+        # branch unreachable there (see the pylint disable) - confirm the supported Python versions
+        except TimeoutError as exc:  # pylint: disable=duplicate-except
+            LOG.error("The attempt to connect to %s failed. Possible known reasons:"
+                      "\n\t-Your network's security group doesn't allow SSH.", floating_ip_address)
+            raise ConnectionException(exc) from exc
+def line_buffered(f):
+    """
+    Generator yielding the output read from f in chunks that end with a newline.
+    https://stackoverflow.com/questions/25260088/paramiko-with-continuous-stdout
+    temporary hangs?
+    NOTE(review): data that is still buffered (or unread) once the exit status is ready is never yielded -
+    confirm whether callers rely on the complete output.
+    :param f: file-like object offering read(size) and channel.exit_status_ready(); read must return bytes
+    :return: generator of bytes
+    """
+    line_buf = b""
+    while not f.channel.exit_status_ready():
+        line_buf += f.read(1024)
+        # only yield once the collected data ends with a line break
+        if line_buf.endswith(b'\n'):
+            yield line_buf
+            line_buf = b''
+def execute_ssh_cml_commands(client, commands):
+    """
+    Executes commands one after another on remote and logs their output.
+    Output lines containing "[BIBIGRID]" are logged as info, all others as debug.
+    Raises ExecutionException as soon as a command returns an exit status other than 0.
+    :param client: Client with connection to remote
+    :param commands: Commands to execute on remote; each entry holds the command at index 0
+    and its description at index 1
+    """
+    for command in commands:
+        remote_command, description = command[0], command[1]
+        _, ssh_stdout, _ = client.exec_command(remote_command)
+        ssh_stdout.channel.set_combine_stderr(True)
+        LOG.info(f"REMOTE: {description}")
+        # an empty line marks the end of the output
+        line = ssh_stdout.readline()
+        while len(line) != 0:
+            log_line = LOG.info if "[BIBIGRID]" in line else LOG.debug
+            log_line(f"REMOTE: {line.strip()}")
+            line = ssh_stdout.readline()
+        exit_status = ssh_stdout.channel.recv_exit_status()
+        ssh_stdout.close()
+        if not exit_status:
+            continue
+        msg = f"{description} ... Exit status: {exit_status}"
+        LOG.warning(msg)
+        raise ExecutionException(msg)
+def ansible_preparation(floating_ip, private_key, username, commands=None, filepaths=None):
+    """
+    Executes the ANSIBLE_SETUP commands followed by the additional commands on the server.
+    Copies private key to instance so cluster-nodes are reachable.
+    Copies additional files and executes additional commands if given.
+    The playbook is copied later, because it needs all servers setup and is not time intensive.
+    See: create.update_playbooks
+    :param floating_ip: public ip of server to ansible-prepare
+    :param private_key: generated private key of all cluster-server
+    :param username: username of all server
+    :param commands: additional commands to execute
+    :param filepaths: additional files to copy: (localpath, remotepath)
+    """
+    LOG.info("Ansible preparation...")
+    # new lists are built so that the lists handed in by the caller are not modified
+    commands = ANSIBLE_SETUP + list(commands or [])
+    filepaths = list(filepaths or []) + [(private_key, PRIVATE_KEY_FILE)]
+    execute_ssh(floating_ip, private_key, username, commands, filepaths)
+def execute_ssh(floating_ip, private_key, username, commands=None, filepaths=None):
+    """
+    Copies files given in filepaths to remote and executes commands there afterwards.
+    Raises ConnectionException if no connection to remote could be established.
+    :param floating_ip: public ip of remote
+    :param private_key: path to the private key file (ECDSA) of remote
+    :param username: username of remote
+    :param commands: commands
+    :param filepaths: filepaths (localpath, remotepath)
+    """
+    if commands is None:
+        commands = []
+    paramiko_key = paramiko.ECDSAKey.from_private_key_file(private_key)
+    with paramiko.SSHClient() as client:
+        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        try:
+            is_active(client=client,
+                      floating_ip_address=floating_ip,
+                      username=username,
+                      private_key=paramiko_key)
+        except ConnectionException:
+            LOG.error(f"Couldn't connect to floating ip {floating_ip} using private key {private_key}.")
+            raise
+        if filepaths:
+            # the sftp session is closed as soon as all files are copied
+            with client.open_sftp() as sftp:
+                for localpath, remotepath in filepaths:
+                    copy_to_server(sftp=sftp, localpath=localpath, remotepath=remotepath)
+            LOG.debug("SFTP: Files %s copied.", filepaths)
+        if commands:
+            execute_ssh_cml_commands(client, commands)
+Generates ids and munge keys
+import shortuuid
+from bibigrid2.core.actions import create
+CLUSTER_UUID_ALPHABET = '0123456789abcdefghijkmnopqrstuvwxyz'
+def generate_cluster_id():
+    """
+    Generates a random shortUUID with length MAX_ID_LENGTH using only characters of CLUSTER_UUID_ALPHABET
+    :return: the generated id
+    """
+    id_generator = shortuuid.ShortUUID()
+    id_generator.set_alphabet(CLUSTER_UUID_ALPHABET)
+    cluster_id = id_generator.random(MAX_ID_LENGTH)
+    return cluster_id
+def generate_safe_cluster_id(providers):
+    """
+    Generates cluster_ids until one is found that is not in use on any provider
+    :param providers: providers to check whether they use said cluster_id
+    :return: cluster_id
+    """
+    while True:
+        candidate = generate_cluster_id()
+        if is_unique_cluster_id(candidate, providers):
+            return candidate
+def is_unique_cluster_id(cluster_id, providers):
+    """
+    Checks if cluster_id is not in use on any provider
+    :param cluster_id: generated cluster_id
+    :param providers: providers to check
+    :return: True if cluster_id is unique. False else.
+    """
+    # the names only depend on cluster_id, so they are built once instead of once per server
+    cluster_server_names = (create.MASTER_IDENTIFIER(cluster_id=cluster_id),
+                            create.VPN_WORKER_IDENTIFIER(cluster_id=cluster_id),
+                            create.WORKER_IDENTIFIER(cluster_id=cluster_id))
+    for provider in providers:
+        for server in provider.list_servers():
+            if server["name"] in cluster_server_names:
+                return False
+    return True
+def generate_munge_key():
+    """
+    Generates a munge key for slurm: a random shortUUID of length 32
+    :return: the generated key
+    """
+    key_generator = shortuuid.ShortUUID()
+    return key_generator.random(32)
+Paths that are used by Ansible. Especially playbook, vars files and Co.
+import os
+import bibigrid2.core.utility.paths.basic_path as bP
+ANSIBLE_HOSTS: str = "ansible_hosts"
+COMMON_YML: str = "common.yml"
+SITE_YML: str = "site.yml"
+REQUIREMENTS_YML: str = "requirements.yml"
+UPLOAD_PATH: str = "/tmp/roles/"
+VARS_PATH: str = "vars/"
+ROLES_PATH: str = "roles/"
+LOGIN_YML: str = VARS_PATH + "login.yml"
+INSTANCES_YML: str = VARS_PATH + "instances.yml"
+CONFIG_YML: str = VARS_PATH + "common_configuration.yml"
+WORKER_SPECIFICATION_YML: str = VARS_PATH + "worker_specification.yml"
+ADDITIONAL_ROLES_PATH: str = ROLES_PATH + "additional/"
+DEFAULT_IP_FILE = VARS_PATH + "{{ ansible_default_ipv4.address }}.yml"
+# ANSIBLE_CFG = "ansible.cfg"
+PLAYBOOK = "playbook/"
+PLAYBOOK_PATH_REMOTE: str = os.path.join("/opt/", PLAYBOOK)
+# PLAYBOOK_PATH_REMOTE_SLURM: str = os.path.join("/opt/slurm/", PLAYBOOK)
+Module containing the most basic paths. Must stay at the same place relative to root.
+import os
+from pathlib import Path
+RESOURCES = "resources"
+# if the relative path from this file to resources is altered, the next line must be adapted or files will not be found.
+ROOT_PATH = Path(__file__).absolute().parents[4]
+Paths that are used by bin script copying
+import os
+import bibigrid2.core.utility.paths.basic_path as bP
+BIN: str = "bin/"
+BIN_PATH: str = os.path.join(bP.RESOURCES_PATH, BIN)
+Validates configuration and cloud_specification
+import logging
+import os
+from bibigrid2.core.utility.handler import configuration_handler
+ACCEPTED_KEY_IDENTIFIERS = {"RSA": 4096, "ECDSA": 521, "ED25519": 256}
+LOG = logging.getLogger("bibigrid")
+def evaluate(check_name, check_result):
+    """
+    Logs check_result as warning if failed and as info if succeeded.
+    :param check_name: name of the check (for logging)
+    :param check_result: result of the check; truthy means success
+    :return: check_result (unchanged)
+    """
+    if not check_result:
+        LOG.warning("Checking %s: Failure", check_name)
+    else:
+        LOG.info("Checking %s: Success", check_name)
+    return check_result
+def check_provider_data(provider_data_list, provider_count):
+    """
+    Checks if all provider datas are unique and if as many provider datas as providers are given
+    #ToDo for multiple cloud locations additional provider data needs to be added
+    :param provider_data_list: list of all provider data
+    :param provider_count: number of providers
+    :return: True if the number of provider datas equals provider_count and all provider datas are unique
+    """
+    LOG.info("Checking provider names")
+    seen = []
+    duplicates = []
+    for provider_data in provider_data_list:
+        target = duplicates if provider_data in seen else seen
+        target.append(provider_data)
+    all_unique = not duplicates
+    if all_unique:
+        LOG.info("All providers are unique.")
+    else:
+        LOG.warning("Duplicate provider(s) %s. For each provider you can only create one configuration. "
+                    "Please check your configurations.", duplicates)
+    count_matches = len(provider_data_list) == provider_count
+    if count_matches:
+        LOG.info("Enough providers given. %s/%s", len(provider_data_list), provider_count)
+    else:
+        LOG.warning("Not enough providers given. %s/%s", len(provider_data_list), provider_count)
+    return all_unique and count_matches
+def evaluate_ssh_public_key_file_security(ssh_public_key_file):
+    """
+    Compares the key length reported by `ssh-keygen -l` with the empiric minimum length of the key's type
+    (see ACCEPTED_KEY_IDENTIFIERS).
+    Unknown key types and too short keys are reported as warnings only; they do not change the return value.
+    NOTE(review): confirm whether weak or unknown keys are meant to fail the check instead.
+    @param ssh_public_key_file: path to the public key file
+    @return: False if ssh-keygen delivers no usable information about the file. True else.
+    """
+    import subprocess  # pylint: disable=import-outside-toplevel
+    # ssh-keygen is called with an argument list instead of a shell string, so paths containing spaces or shell
+    # metacharacters stay a single argument. "~" and environment variables are expanded here instead.
+    key_path = os.path.expandvars(os.path.expanduser(ssh_public_key_file))
+    try:
+        completed = subprocess.run(["ssh-keygen", "-l", "-f", key_path],
+                                   capture_output=True, text=True, check=False)
+    except OSError as exc:
+        LOG.warning("Couldn't execute ssh-keygen for sshPublicKey '%s': %s", ssh_public_key_file, exc)
+        return False
+    # length, key, comment list, identifier_dirty
+    key_info = completed.stdout.split()
+    if len(key_info) < 2 or not key_info[0].isdigit():
+        LOG.warning("Couldn't evaluate sshPublicKey '%s': %s", ssh_public_key_file, completed.stderr.strip())
+        return False
+    length = int(key_info[0])
+    identifier_clean = key_info[-1].strip("()\n")
+    minimum_size = ACCEPTED_KEY_IDENTIFIERS.get(identifier_clean)
+    if not minimum_size:
+        LOG.warning("sshPublicKey '%s' is %s. Which secure length is unknown to bibigrid2.\n"
+                    "Known encryptions are (with minimum size): %s",
+                    ssh_public_key_file, identifier_clean, ACCEPTED_KEY_IDENTIFIERS)
+    else:
+        LOG.info("sshPublicKey '%s' is a known encryption.", ssh_public_key_file)
+        if minimum_size > length:
+            LOG.warning("sshPublicKey '%s' is not long enough! %s should be >= %s, but is %s",
+                        ssh_public_key_file, identifier_clean, minimum_size, length)
+        else:
+            LOG.info("sshPublicKey '%s' is long enough (%s/%s)!", ssh_public_key_file, length, minimum_size)
+    return True
+def has_enough(maximum, needed, keeper, thing):
+    """
+    Method logs and compares whether enough free things are available
+    :param maximum: maximum (available) resources of thing; negative values are treated as "no valid value"
+    :param needed: minimum needed to run
+    :param keeper: description of the object having the thing that is checked (for logging)
+    :param thing: description of what resource is checked (RAM for example) (for logging)
+    :return: True if maximum is larger or equal to the needed or if maximum is negative (ignored). False else.
+    """
+    if maximum >= needed:
+        LOG.info("%s has enough %s: %s/%s", keeper, thing, needed, maximum)
+        return True
+    if maximum < 0:
+        LOG.warning("%s returns no valid value for %s: %s/%s -- Ignored.", keeper, thing, needed, maximum)
+        return True
+    LOG.warning("%s has not enough %s: %s/%s", keeper, thing, needed, maximum)
+    return False
+def check_clouds_yaml_security():
+    """
+    Checks every cloud in clouds-public.yaml for entries that belong into clouds.yaml:
+    a "profile" key or sensitive auth information
+    @return: True if no such entry is stored in clouds-public.yaml. False else.
+    """
+    sensitive_keys = ("password", "username", "application_credential_id", "application_credential_secret")
+    LOG.info("Checking validity of entire clouds.yaml and clouds-public.yaml")
+    _, clouds_public = configuration_handler.get_clouds_files()
+    if not clouds_public:
+        return True
+    success = True
+    for cloud, cloud_public_specification in clouds_public.items():
+        if cloud_public_specification.get("profile"):
+            LOG.warning(f"{cloud}: Profiles should be placed in clouds.yaml not clouds-public.yaml! "
+                        f"Key ignored.")
+            success = False
+        auth = cloud_public_specification.get("auth")
+        if not auth:
+            continue
+        for key in sensitive_keys:
+            if auth.get(key):
+                LOG.warning(f"{cloud}: {key} shouldn't be shared. Move {key} to clouds.yaml!")
+                success = False
+    return success
+def check_cloud_yaml(cloud_specification):
+    """
+    Check if cloud_specification is valid i.e. contains the necessary authentication data.
+    Valid means: region_name is set and auth contains auth_url and either password and username or
+    application_credential_id and application_credential_secret (the latter also requires auth_type to be set).
+    @param cloud_specification: dict to check whether it is a valid cloud_specification
+    @return: True if cloud_specification is valid. False else.
+    """
+    if not cloud_specification:
+        # an empty specification is invalid, too
+        LOG.warning("Missing all cloud_specification information!")
+        return False
+    success = True
+    keys = cloud_specification.keys()
+    auth = cloud_specification.get("auth")
+    if auth:
+        auth_keys = auth.keys()
+        has_password_login = "password" in auth_keys and "username" in auth_keys
+        has_application_credential = ("auth_type" in keys and "application_credential_id" in auth_keys and
+                                      "application_credential_secret" in auth_keys)
+        if not has_password_login and not has_application_credential:
+            LOG.warning("Insufficient authentication information. Needs either password and username or "
+                        "if using application credentials: "
+                        "auth_type, application_credential_id and application_credential_secret.")
+            success = False
+        if "auth_url" not in auth_keys:
+            LOG.warning("Authentification URL auth_url is missing.")
+            success = False
+    else:
+        LOG.warning("Missing all auth information!")
+        success = False
+    if "region_name" not in keys:
+        LOG.warning("region_name is missing.")
+        success = False
+    return success
+class ValidateConfiguration:
+ """
+ This class contains necessary algorithms to validate configuration files
+ """
+    def __init__(self, configurations, providers):
+        """
+        Stores configurations and providers and creates the required_resources_dict with every counter set to 0.
+        While executing the checks, needed resources are counted.
+        In the end check_quotas will decide whether enough resources are available.
+        :param configurations: List of configurations (dicts)
+        :param providers: List of providers
+        """
+        self.configurations = configurations
+        self.providers = providers
+        resource_names = ('total_cores', 'floating_ips', 'instances', 'total_ram',
+                          'Volumes', 'VolumeGigabytes', 'Snapshots', 'Backups',
+                          'BackupGigabytes')
+        self.required_resources_dict = dict.fromkeys(resource_names, 0)
+    def validate(self):
+        """
+        Validation of the configuration file with the selected cloud provider.
+        First the providers are checked (providers given, provider data unique and complete).
+        If that fails, validation stops. Otherwise all of the following checks are executed, even if one fails:
+        Check master/vpn
+        Check servergroup
+        Check instances
+        Check volumes
+        Check network
+        Check quotas
+        Check sshPublicKeyFiles
+        Check cloudYamls
+        :return: truthy if all checks succeeded, falsy else
+        """
+        providers_given = bool(self.providers)
+        LOG.info("Validating config file...")
+        infrastructures = configuration_handler.get_list_by_key(self.configurations, "infrastructure")
+        success = check_provider_data(infrastructures, len(self.configurations)) and providers_given
+        if not success:
+            LOG.warning("Providers not set correctly in configuration file. Check log for more detail.")
+            return success
+        for check_name, check_function in (
+                ("master/vpn", self.check_master_vpn_worker), ("servergroup", self.check_server_group),
+                ("instances", self.check_instances), ("volumes", self.check_volumes),
+                ("network", self.check_network), ("quotas", self.check_quotas),
+                ("sshPublicKeyFiles", self.check_ssh_public_key_files), ("cloudYamls", self.check_clouds_yamls)):
+            success = evaluate(check_name, check_function()) and success
+        return success
+    def check_master_vpn_worker(self):
+        """
+        Checks if first configuration has a masterInstance (and no vpnInstance) defined
+        and every other configuration has a vpnInstance (and no masterInstance) defined.
+        If one is missing said provider wouldn't be reachable over the cluster, because no floating IP would be given.
+        :return: True if first configuration has only a masterInstance and every other only a vpnInstance
+        """
+        LOG.info("Checking master/vpn")
+        master_configuration = self.configurations[0]
+        success = bool(master_configuration.get("masterInstance")) and not master_configuration.get("vpnInstance")
+        for vpn_configuration in self.configurations[1:]:
+            if not (vpn_configuration.get("vpnInstance") and not vpn_configuration.get("masterInstance")):
+                success = False
+        return success
+    def check_provider_connections(self):
+        """
+        Checks if all providers hold a connection (attribute conn)
+        :return: True if all providers are reachable
+        """
+        providers_unconnectable = [provider.name for provider in self.providers if not provider.conn]
+        if not providers_unconnectable:
+            return True
+        LOG.warning("API connection to %s not successful. Please check your configuration.",
+                    providers_unconnectable)
+        return False
def check_instances(self):
    """
    Runs check_instance for the master (or vpn) instance and all worker instances of every configuration.
    Also counts one required floating ip per configuration.
    :return: true if image and instance-type (flavor) exist for all instances and are compatible
    """
    LOG.info("Checking instance images and type")
    all_valid = True
    configuration = None  # defined before the try block so the except branch can always report it
    try:
        for configuration, provider in zip(self.configurations, self.providers):
            self.required_resources_dict["floating_ips"] += 1
            # a configuration holds either a master or - if no master is set - a vpn instance
            entry_key = "masterInstance" if configuration.get("masterInstance") else "vpnInstance"
            all_valid = self.check_instance(entry_key, configuration[entry_key], provider) and all_valid
            for worker in configuration.get("workerInstances", []):
                all_valid = self.check_instance("workerInstance", worker, provider) and all_valid
    except KeyError as exc:
        LOG.warning("Not found %s, but required in configuration %s.", str(exc), configuration)
        all_valid = False
    return all_valid
def check_instance(self, instance_name, instance, provider):
    """
    Checks if instance image exists, is active and whether it is compatible with the defined
    instance/server type (flavor).
    Adds the instance's count (1 if count is missing or falsy) to the required instances.
    :param instance_name: containing name for logging purposes
    :param instance: dict containing image, type and (optionally) count
    :param provider: provider
    :return: true if type and image compatible and existing
    """
    self.required_resources_dict["instances"] += instance.get("count") or 1
    instance_image_id_or_name = instance["image"]
    instance_image = provider.get_image_by_id_or_name(image_id_or_name=instance_image_id_or_name)
    if not instance_image:
        problem = "not found"
    elif instance_image["status"] != "active":
        problem = "not active"
    else:
        problem = None
    if problem:
        LOG.warning("Instance %s image: %s %s", instance_name, instance_image_id_or_name, problem)
        print("Available active images:")
        # get_active_images has to be called; joining the bound method itself raises a TypeError
        print("\n".join(provider.get_active_images()))
        return False
    LOG.info("Instance %s image: %s found", instance_name, instance_image_id_or_name)
    instance_type = instance["type"]
    return self.check_instance_type_image_combination(instance_type, instance_image, provider)
def check_instance_type_image_combination(self, instance_type, instance_image, provider):
    """
    Checks, if enough ram, disk space for instance_image are provided by instance_type on provider.
    Adds the flavor's ram and vcpus to the required resources used by the quota check.
    :param instance_type: name of the flavor
    :param instance_image: image as accepted by provider.get_image_by_id_or_name
    :param provider: provider
    :return: true, if enough resources available
    """
    success = True
    flavor = provider.get_flavor(instance_type)
    if not flavor:
        LOG.warning("Flavor %s does not exist.", instance_type)
        print("Available flavors:")
        print("\n".join(provider.get_active_flavors()))
        return False
    type_max_disk_space = flavor["disk"]
    type_max_ram = flavor["ram"]
    # one provider lookup is enough; both minimum requirements come from the same image
    image = provider.get_image_by_id_or_name(instance_image)
    image_min_disk_space = image["min_disk"]
    image_min_ram = image["min_ram"]
    for maximum, needed, thing in [(type_max_disk_space, image_min_disk_space, "disk space"),
                                   (type_max_ram, image_min_ram, "ram")]:
        # has_enough is evaluated first so every shortage is reported, not only the first one
        success = has_enough(maximum, needed, f"Type {instance_type}", thing) and success
    # prepare check quotas
    self.required_resources_dict["total_ram"] += type_max_ram
    self.required_resources_dict["total_cores"] += flavor["vcpus"]
    return success
def check_volumes(self):
    """
    Looks up every masterMounts entry as volume first and as snapshot second.
    Only the part in front of the first ':' of an entry is used as name or id.
    A found snapshot increases the required Volumes by one and VolumeGigabytes by its size.
    :return: True if all snapshot and volumes are found. Else false.
    """
    LOG.info("Checking volumes...")
    all_found = True
    for configuration, provider in zip(self.configurations, self.providers):
        for volume_identifier in configuration.get("masterMounts") or []:
            volume_name_or_id = volume_identifier.partition(":")[0]
            if provider.get_volume_by_id_or_name(volume_name_or_id):
                LOG.info(f"Volume '{volume_name_or_id}' found")
                continue
            # no volume of that name or id, so it might be a snapshot
            snapshot = provider.get_volume_snapshot_by_id_or_name(volume_name_or_id)
            if snapshot:
                LOG.info("Snapshot '%s' found", volume_name_or_id)
                self.required_resources_dict["Volumes"] += 1
                self.required_resources_dict["VolumeGigabytes"] += snapshot["size"]
            else:
                LOG.warning("Neither Volume nor Snapshot '%s' found", volume_name_or_id)
                all_found = False
    return all_found
def check_network(self):
    """
    Check if network (or subnet) is accessible.
    Every configuration has to name a network or a subnet, and each named one has to be found by
    the configuration's provider.
    :return: True if every configuration names a network or subnet and all named ones are found
    """
    LOG.info("Checking network...")
    success = True
    for configuration, provider in zip(self.configurations, self.providers):
        network_name_or_id = configuration.get("network")
        subnet_name_or_id = configuration.get("subnet")
        if network_name_or_id:
            if provider.get_network_by_id_or_name(network_name_or_id):
                LOG.info("Network '%s' found", network_name_or_id)
            else:
                LOG.warning("Network '%s' not found", network_name_or_id)
                success = False
        if subnet_name_or_id:
            if provider.get_subnet_by_id_or_name(subnet_name_or_id):
                LOG.info("Subnet '%s' found", subnet_name_or_id)
            else:
                LOG.warning("Subnet '%s' not found", subnet_name_or_id)
                success = False
        # judged per configuration; looking only at the last configuration would hide earlier gaps
        if not (network_name_or_id or subnet_name_or_id):
            LOG.warning("Neither network nor subnet set in configuration %s.", configuration)
            success = False
    return success
def check_server_group(self):
    """
    Looks up the serverGroup of every configuration that sets one.
    Configurations without a serverGroup are skipped.
    :return: True if server group accessible
    """
    all_found = True
    for configuration, provider in zip(self.configurations, self.providers):
        server_group_name_or_id = configuration.get("serverGroup")
        if not server_group_name_or_id:
            continue
        if provider.get_server_group_by_id_or_name(server_group_name_or_id):
            LOG.info("ServerGroup '%s' found", server_group_name_or_id)
        else:
            LOG.warning("ServerGroup '%s' not found", server_group_name_or_id)
            all_found = False
    return all_found
def check_quotas(self):
    """
    Gets remaining resources from the provider and compares them to the needed resources.
    Needed resources are set during the other checks.
    Covered resources are: cores, floating_ips, instances, ram, volumes, volumeGigabytes, snapshots, backups and
    backupGigabytes. If a concrete provider implementation is unable to return remaining resources a maximum number
    is returned to make the check not fail because of the missing API implementation.
    NOTE(review): the same required_resources_dict is compared against every provider - confirm that
    this is intended for clusters spanning multiple providers.
    :return: True if check succeeded. Else false.
    """
    LOG.info("Checking quotas")
    success = True
    LOG.info("required/available")
    for provider in self.providers:
        free_resources_dict = provider.get_free_resources()
        # name the provider that is actually compared instead of always the first one
        project_label = f"Project {provider.cloud_specification['identifier']}"
        for key, value in self.required_resources_dict.items():
            success = has_enough(free_resources_dict[key], value, project_label, key) and success
    return success
def check_ssh_public_key_files(self):
    """
    Checks for every configured sshPublicKeyFile whether it is an existing file and
    passes existing files on to the security evaluation.
    :return: True if check succeeded. Else false.
    """
    all_valid = True
    for configuration in self.configurations:
        for ssh_public_key_file in configuration.get("sshPublicKeyFiles") or []:
            if os.path.isfile(ssh_public_key_file):
                LOG.info("sshPublicKeyFile '%s' found", ssh_public_key_file)
                all_valid = evaluate_ssh_public_key_file_security(ssh_public_key_file) and all_valid
            else:
                LOG.warning("sshPublicKeyFile '%s' not found", ssh_public_key_file)
                all_valid = False
    return all_valid
def check_clouds_yamls(self):
    """
    Runs check_cloud_yaml on every cloud specification of the configurations and
    the clouds yaml security check afterwards.
    @return: True if all clouds are valid
    """
    LOG.info("Checking cloud specifications...")
    all_valid = True
    specifications = configuration_handler.get_cloud_specifications(self.configurations)
    for position, specification in enumerate(specifications):
        if check_cloud_yaml(specification):
            continue
        all_valid = False
        LOG.warning("Cloud specification %s is faulty. BiBiGrid understood %s.", position, specification)
    return check_clouds_yaml_security() and all_valid
+Alternative version of yaml.SafeDumper that ignores aliases.
+import yaml
class NoAliasSafeDumper(yaml.SafeDumper):
    """
    yaml.SafeDumper whose ignore_aliases always answers True.
    That is the only difference to the regular yaml.SafeDumper class.
    """

    def ignore_aliases(self, data):
        """
        Answers True for every data.
        :param data: not evaluated
        :return: True
        """
        return True
+""" module for additional exceptions """
class ConnectionException(Exception):
    """ Exception for connection problems. """
class ExecutionException(Exception):
    """ Exception for execution problems. """
+Expands threading.
+import threading
class ReturnThread(threading.Thread):
    """
    Extends the Thread functionality:
    - Return value of called function is returned by join()
    - An exception occurred within the called function is raised by join()
    """

    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None):
        """
        :param group: passed on to threading.Thread
        :param target: callable executed by run()
        :param name: thread name
        :param args: positional arguments for target
        :param kwargs: keyword arguments for target; None means no keyword arguments
        :param daemon: passed on to threading.Thread
        """
        # kwargs defaults to None (not {}) so no dict is shared between instances;
        # threading.Thread replaces None by a fresh dict
        super().__init__(group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon)
        self._return = None
        self._exc = None

    def run(self):
        """
        Calls target and stores its return value or - if it raises - the exception for join().
        """
        if self._target is None:
            return
        try:
            self._return = self._target(*self._args, **self._kwargs)
        except Exception as exc:  # pylint: disable=broad-except
            # stored instead of raised, because join() re-raises it in the joining thread
            self._exc = exc

    def join(self, timeout=None):
        """
        Waits for the thread like threading.Thread.join.
        :param timeout: seconds to wait at most; None waits until the thread terminates
        :return: return value of target (None if target has not returned yet)
        :raises Exception: the exception raised by target, if any
        """
        super().join(timeout)
        if self._exc is not None:
            raise self._exc
        return self._return
+Concrete implementation of provider.py for openstack
+import logging
+import keystoneclient
+import openstack
+from cinderclient import client
+from keystoneauth1 import session
+from keystoneauth1.exceptions.http import NotFound
+from keystoneauth1.identity import v3
+from bibigrid2.core import provider
+from bibigrid2.core.actions import create
+from bibigrid2.core.actions import version
+from bibigrid2.models.exceptions import ExecutionException
+LOG = logging.getLogger("bibigrid")
class OpenstackProvider(provider.Provider):  # pylint: disable=too-many-public-methods
    """
    Specific implementation of the Provider class for openstack.
    Holds an openstack connection (conn), a keystone client (application credentials)
    and a cinder client (volume limits and volume creation).
    """
    NAME = "OpenstackProvider"

    # to be read from clouds.yaml file.

    def __init__(self, cloud_specification):
        """
        Creates the openstack connection and - based on one shared session - keystone and cinder client.
        :param cloud_specification: dict holding at least the keys "auth" and "region_name"
        """
        super().__init__(cloud_specification)
        self.conn = self.create_connection()
        sess = self.create_session()
        self.keystone_client = keystoneclient.client.Client(session=sess, interface='public')
        self.cinder = client.Client(3, session=sess)

    def create_session(self, app_name="openstack_scripts", app_version="1.0"):
        """
        Creates and returns a session that can be used to create a connection to different openstack services.
        Application credentials are preferred over username and password if both are complete.
        @param app_name:
        @param app_version:
        @return: session
        @raise KeyError: if auth holds neither complete application credentials nor complete password data
        """
        auth = self.cloud_specification["auth"]
        if all(key in auth for key in ["auth_url", "application_credential_id", "application_credential_secret"]):
            auth_session = v3.ApplicationCredential(
                auth_url=auth["auth_url"],
                application_credential_id=auth["application_credential_id"],
                application_credential_secret=auth["application_credential_secret"]
            )
        elif all(key in auth for key in ["auth_url", "username", "password", "project_id", "user_domain_name"]):
            auth_session = v3.Password(auth_url=auth["auth_url"],
                                       username=auth["username"],
                                       password=auth["password"],
                                       project_id=auth["project_id"],
                                       user_domain_name=auth["user_domain_name"])
        else:
            raise KeyError("Not enough authentication information in clouds.yaml/clouds-public.yaml "
                           "to create a session. Use one:\n"
                           "Application Credentials: auth_url, application_credential_id and "
                           "application_credential_secret\n"
                           "Password: auth_url, username, password, project_id and user_domain_name")
        return session.Session(auth=auth_session,
                               app_name=app_name, app_version=app_version)

    def create_connection(self, app_name="openstack_bibigrid", app_version=version.__version__):
        """
        Creates an openstack connection from the cloud specification only
        (yaml config files and environment variables are not loaded by openstack.connect).
        @param app_name:
        @param app_version:
        @return: connection
        """
        auth = self.cloud_specification["auth"]
        return openstack.connect(
            load_yaml_config=False,
            load_envvars=False,
            auth_url=auth["auth_url"],
            project_name=auth.get("project_name"),
            username=auth.get("username"),
            password=auth.get("password"),
            region_name=self.cloud_specification["region_name"],
            user_domain_name=auth.get("user_domain_name"),
            # an explicitly given project domain wins; otherwise the user's domain is used as before
            project_domain_name=auth.get("project_domain_name", auth.get("user_domain_name")),
            app_name=app_name,
            app_version=app_version,
            application_credential_id=auth.get("application_credential_id"),
            application_credential_secret=auth.get("application_credential_secret"),
            interface=self.cloud_specification.get("interface"),
            identity_api_version=self.cloud_specification.get("identity_api_version"),
            auth_type=self.cloud_specification.get("auth_type")
        )

    def create_application_credential(self, name=None):
        """
        Creates an application credential using the keystone client.
        :param name: name of the application credential
        :return: created application credential as dict
        """
        return self.keystone_client.application_credentials.create(name=name).to_dict()

    def delete_application_credential_by_id_or_name(self, ac_id_or_name):
        """
        Deletes existing application credential by id or name and returns true.
        If application credential not found it returns false.
        :param ac_id_or_name: application credential id or name
        :return: True if deleted else false
        """
        try:
            self.keystone_client.application_credentials.delete(ac_id_or_name)  # id
            return True
        except NotFound:
            try:
                self.keystone_client.application_credentials.delete(
                    self.keystone_client.application_credentials.find(name=ac_id_or_name))  # name
                return True
            except NotFound:
                return False

    def get_image_by_id_or_name(self, image_id_or_name):
        """
        :param image_id_or_name: id or name of image
        :return: result of the connection's get_image
        """
        return self.conn.get_image(name_or_id=image_id_or_name)

    def get_flavor(self, instance_type):
        """
        :param instance_type: name or id of flavor
        :return: result of the connection's get_flavor
        """
        return self.conn.get_flavor(instance_type)

    def get_volume_snapshot_by_id_or_name(self, snapshot_id_or_name):
        """
        :param snapshot_id_or_name: id or name of snapshot
        :return: result of the connection's get_volume_snapshot
        """
        return self.conn.get_volume_snapshot(name_or_id=snapshot_id_or_name)

    def get_network_by_id_or_name(self, network_id_or_name):
        """
        :param network_id_or_name: id or name of network
        :return: result of the connection's get_network
        """
        return self.conn.get_network(name_or_id=network_id_or_name)

    def get_subnet_by_id_or_name(self, subnet_id_or_name):
        """
        :param subnet_id_or_name: id or name of subnet
        :return: result of the connection's get_subnet
        """
        return self.conn.get_subnet(name_or_id=subnet_id_or_name)

    def list_servers(self):
        """
        :return: list of all servers of the connection, each converted by toDict()
        """
        return [elem.toDict() for elem in self.conn.list_servers()]

    def create_server(self, name, flavor, image,
                      network, key_name=None, wait=True, volumes=None):
        """
        Creates a server and - if wait is set - waits up to 600 seconds for it and fetches it again.
        :param name: name of server
        :param flavor: flavor (instance type)
        :param image: image
        :param network: network
        :param key_name: name of keypair
        :param wait: whether to wait for the server
        :param volumes: volumes passed on to the connection's create_server
        :return: server
        :raises ConnectionError: on openstack BadRequestException
        :raises ExecutionException: on any other openstack SDKException or on AttributeError
        """
        try:
            server = self.conn.create_server(name=name, flavor=flavor, image=image,
                                             network=network, key_name=key_name, volumes=volumes)
        except openstack.exceptions.BadRequestException as exc:
            raise ConnectionError() from exc
        except openstack.exceptions.SDKException as exc:
            raise ExecutionException() from exc
        except AttributeError as exc:
            raise ExecutionException("Unable to create server due to faulty configuration.") from exc
        if wait:
            self.conn.wait_for_server(server=server, auto_ip=False, timeout=600)
            server = self.conn.get_server(server["id"])
        return server

    def delete_server(self, name_or_id, delete_ips=True):
        """
        Deletes server. floating_ip as well if delete_ips is true. The resources are then free again
        :param name_or_id:
        :param delete_ips:
        :return: result of the connection's delete_server (called with wait=False)
        """
        return self.conn.delete_server(name_or_id=name_or_id, wait=False,
                                       timeout=180, delete_ips=delete_ips,
                                       delete_ip_retry=1)

    def delete_keypair(self, key_name):
        """
        :param key_name: name of keypair
        :return: result of the connection's delete_keypair
        """
        return self.conn.delete_keypair(key_name)

    def get_server_group_by_id_or_name(self, server_group_id_or_name):
        """
        :param server_group_id_or_name: id or name of server group
        :return: result of the connection's get_server_group
        """
        return self.conn.get_server_group(name_or_id=server_group_id_or_name)

    def close(self):
        """
        Closes the connection.
        :return: result of the connection's close
        """
        return self.conn.close()

    def create_keypair(self, name, public_key):
        """
        :param name: name of keypair
        :param public_key: public key
        :return: result of the connection's create_keypair
        """
        return self.conn.create_keypair(name=name, public_key=public_key)

    def get_network_id_by_subnet(self, subnet):
        """
        :param subnet: name or id of subnet
        :return: network id of subnet; the falsy lookup result if subnet is not found
        """
        subnet = self.conn.get_subnet(subnet)
        return subnet["network_id"] if subnet else subnet

    def get_subnet_ids_by_network(self, network):
        """
        :param network: name or id of network
        :return: subnets of network; the falsy lookup result if network is not found
        """
        network = self.conn.get_network(network)
        return network["subnets"] if network else network

    def get_free_resources(self):
        """
        Uses the cinder API to get all relevant volume resources.
        https://github.com/openstack/python-cinderclient/blob/master/cinderclient/v3/limits.py
        Uses the nova API to get all relevant compute resources. Floating-IP is not returned correctly by openstack.
        :return: Dictionary containing the free resources
        """
        compute_limits = dict(self.conn.compute.get_limits()["absolute"])
        # maybe needs limits.get(os.environ["OS_PROJECT_NAME"]) in the future
        volume_limits_generator = self.cinder.limits.get().absolute
        volume_limits = {absolut_limit.name: absolut_limit.value for absolut_limit in
                         volume_limits_generator}
        # cinder names the used gigabytes "totalGigabytesUsed"; it is stored under the key the loop below
        # builds ("total" + key + "Used"). Falls back to 0 if cinder does not report it.
        volume_limits["totalVolumeGigabytesUsed"] = volume_limits.get("totalGigabytesUsed", 0)
        free_resources = {}
        for key in ["total_cores", "floating_ips", "instances", "total_ram"]:
            free_resources[key] = compute_limits[key] - compute_limits[key + "_used"]
        for key in ["Volumes", "VolumeGigabytes", "Snapshots", "Backups", "BackupGigabytes"]:
            free_resources[key] = volume_limits["maxTotal" + key] - volume_limits[
                "total" + key + "Used"]
        return free_resources

    def get_volume_by_id_or_name(self, name_or_id):
        """
        :param name_or_id: name or id of volume
        :return: result of the connection's get_volume
        """
        return self.conn.get_volume(name_or_id)

    def create_volume_from_snapshot(self, snapshot_name_or_id):
        """
        Uses the cinder API to create a volume from snapshot:
        https://github.com/openstack/python-cinderclient/blob/master/cinderclient/v3/volumes.py
        :param snapshot_name_or_id: name or id of snapshot
        :return: id of created volume; None if snapshot is not found or not available
        """
        LOG.debug("Trying to create volume from snapshot")
        snapshot = self.conn.get_volume_snapshot(snapshot_name_or_id)
        if not snapshot:
            LOG.warning("Snapshot %s not found.", snapshot_name_or_id)
            return None
        LOG.debug("Snapshot %s found.", snapshot_name_or_id)
        if snapshot["status"] != "available":
            LOG.warning("Snapshot %s is %s; must be available.", snapshot_name_or_id, snapshot['status'])
            return None
        LOG.debug("Snapshot %s is available.", snapshot_name_or_id)
        size = snapshot["size"]
        name = create.PREFIX_WITH_SEP + snapshot["name"]
        description = f"Created from snapshot {snapshot_name_or_id} by BiBiGrid"
        volume = self.cinder.volumes.create(size=size, snapshot_id=snapshot["id"], name=name,
                                            description=description)
        return volume.to_dict()["id"]

    def get_external_network(self, network_name_or_id):
        """
        Finds router interface with network id equal to given network and by that the external network.
        :param network_name_or_id:Name or id of network
        :return:Corresponding external network; None if the network or a matching router interface is not found
        """
        network = self.conn.get_network(network_name_or_id)
        if not network:
            # without this guard a missing network would end in a TypeError when reading its id
            LOG.warning("Network %s not found.", network_name_or_id)
            return None
        network_id = network["id"]
        for router in self.conn.list_routers():
            for interface in self.conn.list_router_interfaces(router):
                if interface.network_id == network_id:
                    return router.external_gateway_info["network_id"]
        return None

    def attach_available_floating_ip(self, network=None, server=None):
        """
        Get a floating IP from a network or a pool and attach it to the server
        :param network:
        :param server: if not set, the floating IP is only fetched and not attached
        :return: floating ip
        """
        floating_ip = self.conn.available_floating_ip(network=network)
        if server:
            self.conn.compute.add_floating_ip_to_server(server, floating_ip["floating_ip_address"])
        return floating_ip

    def get_images(self):
        """
        Get a generator able to generate all images
        @return: A generator able to generate all images
        """
        return self.conn.compute.images()

    def get_flavors(self):
        """
        Get a generator able to generate all flavors
        @return: A generator able to generate all flavors
        """
        return self.conn.compute.flavors()
+# BiBiGrid Features
+| Name | Purpose |
+| [Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like |
+| [Terminate Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. |
+| [Create](features/create.md) | Creates the cluster specified by the configuration. |
+ | [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. |
+| [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. |
+| [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that given cluster was setup with an ide. |
+| [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. |
+| [Cloud Specification Data](features/cloud_specification_data.md) | Contains necessary data to establish a general connection to the provider. |
+ | [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. |
+| [Command Line Interface](features/CLI.md) | What command line arguments can be passed into BiBiGrid. |
\ No newline at end of file
+# BiBiGrid Used Software
+| Name | Purpose | Official Link |
+| [Ansible](software/ansible.md) | Ansible, an open source community project by Red Hat, enables the idempotent setup of servers. Ansible is used to **prepare** all cluster nodes. | [Getting started with Ansible](https://docs.ansible.com/ansible/latest/getting_started/index.html) |
+| [Slurm](software/slurm.md) | Slurm is an open source cluster management and job scheduling system. Slurm is used to **schedule** cluster nodes i.e. Slurm will start and shutdown nodes as needed. | [Quick Start User Guide](https://slurm.schedmd.com/quickstart.html) |
+| [Theia IDE](software/theia_ide.md) | Theia IDE is a Web IDE, built using the Theia Framework, that allows easy, intuitive and abstract **web access** to cluster nodes. Theia IDE is optional. | [Using "Theia" as an End User](https://theia-ide.org/docs/user_getting_started/) |
+| [Zabbix](software/zabbix.md) | Zabbix is an open source **monitoring** solution for networks, servers, clouds, applications and services. Zabbix is optional. | [What is Zabbix](https://www.zabbix.com/documentation/current/en/manual/introduction/about) |
\ No newline at end of file
+# CLI
+Available command line parameters:
+- `-h, --help` show help message and exit
+- `-v, --verbose` Increases output verbosity (can be of great use when cluster fails to start). `-v` adds more detailed info to the logfile, `-vv` adds debug information to the logfile.
+- `-d, --debug` Keeps cluster active in case of an error. Offers termination after successful create.
+- `-i , --config_input (required)` Path to YAML configurations file. Relative paths can be used and start at `~/.config/bibigrid`
+- `-cid , --cluster_id ` Cluster id is needed for ide and termination. If no cluster id is set, the last started cluster's id will be used (except for `list_clusters`).
+## Mutually exclusive actions: choose exactly one
+- `-V, --version` Displays version.
+- `-t, --terminate_cluster` Terminates cluster. Needs cluster-id set.
+- `-c, --create` Creates cluster.
+- `-l, --list_clusters` Lists all running clusters. If cluster-id is
+ set, will list this cluster in detail only.
+- `-ch, --check` Validates cluster configuration.
+- `-ide, --ide` Establishes a secured connection to ide.
+ Needs cluster-id set.
+- `-u, --update` Updates master's playbook. Needs cluster-id set, no job running and no workers powered up.
\ No newline at end of file
+# Check
\ No newline at end of file
+# Cloud Specification Data
+To access the cloud, authentication information is required. BiBiGrid2 no longer uses environment variables, but a two-file system instead.
+`clouds.yaml` and `clouds-public.yaml` can be placed in `~/.config/bibigrid/` or `/etc/bibigrid/` and will be loaded by BiBiGrid2 on execution.
+While you store your password and username in `clouds.yaml` (private), you can store all other information ready to share in `clouds-public.yaml` (shareable).
+However, all information can just be stored in `clouds.yaml`.
+Keys set in `clouds.yaml` will overwrite keys from `clouds-public.yaml`.
+## Openstack
+Be aware that the downloaded `clouds.yaml` file contains all information.
+OpenStack does not split information into `clouds.yaml` and `clouds-public.yaml` on its own.
+The example files show an example split.
+### Password Example
+Using the password `clouds.yaml` is easy. However, since passwords - unlike [Application Credentials](#application-credentials-example) -
+don't have an expiration date, caution is advised.
+Move the downloaded file to `~/.config/bibigrid/` or `/etc/bibigrid/`.
+##### Password clouds.yaml
+ openstack:
+ profile: nameOfCloudsPublicYamlEntry
+ auth:
+ username: SamSampleman
+ password: SecurePassword
+##### Password clouds-public.yaml
+ nameOfCloudsPublicYamlEntry:
+ auth:
+ auth_url: https://somelink:someport
+ project_id: someProjectId
+ project_name: someProjectName
+ user_domain_name: someDomainName
+ region_name: someRegionName
+ interface: "public"
+ identity_api_version: 3
+### Application Credentials Example
+The following shows how an Application Credential can be created and the related `clouds.yaml` downloaded.
+Application Credentials are the preferred way of authentication since they do have an expiration date and
+their access can be limited.
+Move the downloaded file to `~/.config/bibigrid/` or `/etc/bibigrid/`.
+#### Application Credential clouds.yaml
+ openstack:
+ profile: nameOfCloudsPublicYamlEntry
+ auth:
+ application_credential_id: SomeID
+ application_credential_secret: SecureSecret
+#### Application Credential clouds-public.yaml
+ nameOfCloudsPublicYamlEntry:
+ auth:
+ auth_url: https://somelink:someport
+ region_name: SomeRegion
+ interface: "public"
+ identity_api_version: 3
+ auth_type: "v3applicationcredential"
\ No newline at end of file
+# Configuration
+The configuration file (often called `bibigrid.yml`) contains important information about cluster creation.
+The cluster configuration holds a list of configurations where each configuration is assigned to a specific provider
+(location). That allows cluster to stretch over multiple providers. The configuration file is best stored in
+`~/.config/bibigrid/` since BiBiGrid starts its relative search there.
+## Configuration List
+The first configuration is always the master's provider configuration.
+Only the first configuration is allowed to have a master key.
+Every following configuration describes a provider that is not the master's provider containing a number of workers and a
+vpnwkr (vpn worker). The vpnwkr is a worker with a floating IP. That allows the master - that knows all vpnwkrs - to access
+all workers using the floating IP as an entry point into the other local networks. However, all that will be covered by
+an abstraction layer using a virtual network. Therefore, end users can work on a spread cluster without noticing it.
+### Master Provider Configuration
+As mentioned before, the first configuration has a master key. Apart from that it also holds all information that is -
+simply put - true over the entire cluster. We also call those keys global.
+Keys that belong only to a single provider configuration are called local.
+For example whether the master works alongside the workers is a general fact.
+Therefore, it is stored within the first configuration. The master provider configuration.
+## Keys
+### Global
+#### sshPublicKeyFiles (optional)
+`sshPublicKeyFiles` expects a list of public keyfiles to be registered on every node. That allows you to grant access to
+created clusters to the owners of the private keyfile. For example, you can add a colleague's public key to the list and allow
+them to access your started cluster later on to debug it.
+#### masterMounts (optional)
+`masterMounts` expects a list of volumes or snapshots that will then be mounted to the master. If any snapshots are
+given, the related volumes are first created and then those volumes are used by BiBiGrid. Those volumes are not deleted
+after Cluster termination.
+ What is mounting?
+[Mounting](https://man7.org/linux/man-pages/man8/mount.8.html) adds a new filesystem to the file tree allowing access.
+#### nfsShares (optional)
+`nfsShares` expects a list of folder paths to share using nfs. `/vol/spool/` is always an nfsShare.
+This key only makes sense if the [nfs key](#nfs) is set `True`.
+What is NFS?
+NFS (Network File System) is a stable and well-functioning network protocol for exchanging files over the local network.
+#### ansibleRoles (optional)
+Yet to be explained.
+ - file: SomeFile
+ hosts: SomeHosts
+ name: SomeName
+ vars: SomeVars
+ vars_file: SomeVarsFile
+#### ansibleGalaxyRoles (optional)
+Yet to be explained.
+ - hosts: SomeHost
+ name: SomeName
+ galaxy: SomeGalaxy
+ git: SomeGit
+ url: SomeURL
+ vars: SomeVars
+ vars_file: SomeVarsFile
+#### localFS (optional)
+This key helps some users to create a filesystem to their liking. It is not used in general.
+#### localDNSlookup (optional)
+If `True`, master will store the link to its workers. This is called
+[Local DNS Lookup](https://helpdeskgeek.com/networking/edit-hosts-file/).
+#### zabbix (optional)
+If `True`, the monitoring solution [zabbix](https://www.zabbix.com/) will be installed on the master.
+#### nfs (optional)
+If `True`, nfs is created.
+What is NFS?
+NFS (Network File System) is a stable and well-functioning network protocol for exchanging files over the local network.
+#### useMasterAsCompute (optional)
+By default, the master always works together with the workers on submitted jobs. If you set `useMasterAsCompute`
+to `False`, the master will no longer support the workers.
+#### waitForServices (optional):
+Expects a list of services to wait for. This is required if your provider has any post-launch services. If not set,
+seemingly random errors can occur when the service interrupts the ansible execution. Providers and their services are
+listed on [de.NBI Wiki](https://cloud.denbi.de/wiki/) at `Computer Center Specific`.
+### Local
+#### infrastructure (required)
+`infrastructure` sets the used provider implementation for this configuration. Currently only `openstack` is available.
+Other infrastructures would be AWS and so on.
+#### cloud
+`cloud` decides which entry in the `clouds.yaml` is used.
+When using OpenStack the downloaded `clouds.yaml` is named `openstack`
+`cloud: openstack`
+#### workerInstances (optional)
+`workerInstances` expects a list of workers to be used on this specific provider the configuration is for.
+`Instances` are also called `servers`.
+ - type: de.NBI tiny
+ image: Ubuntu 22.04 LTS (2022-10-14)
+ count: 2
+- `type` sets the instance's hardware configuration. Also called `flavor` sometimes.
+- `image` sets the bootable operating system to be installed on the instance.
+- `count` sets how many workers of that `type` `image` combination are to be used by the cluster
+Find your active `images`:
+openstack image list --os-cloud=openstack | grep active
+Find your active `flavors`:
+openstack flavor list --os-cloud=openstack
+#### Master or vpnWorker?
+##### Master
+Only in the first configuration and only one:
+ masterInstance:
+ type: de.NBI tiny
+ image: Ubuntu 22.04 LTS (2022-10-14)
+##### vpnWorker:
+Exactly once in every configuration but the first:
+ vpnInstance:
+ type: de.NBI tiny
+ image: Ubuntu 22.04 LTS (2022-10-14)
+#### sshUser (required)
+`sshUser` is the standard user of the installed images. For `Ubuntu 22.04` this would be `ubuntu`.
+#### region (required)
+Every [region](https://docs.openstack.org/python-openstackclient/rocky/cli/command-objects/region.html) has its own
+openstack deployment. Every [availability zone](#availabilityzone-required) belongs to a region.
+Find your `regions`:
+openstack region list --os-cloud=openstack
+#### availabilityZone (required)
+[availability zones](https://docs.openstack.org/nova/latest/admin/availability-zones.html) allow you to logically group compute hosts.
+Find your `availabilityZones`:
+openstack availability zone list --os-cloud=openstack
+#### subnet (required)
+`subnet` is a block of ip addresses.
+Find available `subnets`:
+openstack subnet list --os-cloud=openstack
+#### localDNSLookup (optional)
+If no full DNS service for started instances is available, set `localDNSLookup: True`.
+Currently the case in Berlin, DKFZ, Heidelberg and Tuebingen.
\ No newline at end of file
+# Create
+Temporary cluster keys will be stored in `~/.config/bibigrid/keys`.
\ No newline at end of file
+# Web IDE
+# List Clusters
\ No newline at end of file
+# Terminate Cluster
\ No newline at end of file
+# Update
\ No newline at end of file
+# Version
\ No newline at end of file
+# Ansible
+## Ansible Tutorial
+- [Ansible Workshop Presentation](https://docs.google.com/presentation/d/1W4jVHLT8dB1VsdtxXqtKlMqGbeyEWTQvSHh0WMfWo2c/edit#slide=id.p10)
+- [de.NBI Cloud's Ansible Course](https://gitlab.ub.uni-bielefeld.de/denbi/ansible-course)
+## Executing BiBiGrid's Playbook Manually
+Only execute BiBiGrid's playbook manually when no worker is up. The playbook is executed automatically for workers powering up.
+If you've implemented changes to BiBiGrid's playbook, you might want to execute BiBiGrid's playbook manually to see how
+those changes play out. For this we need the preinstalled `bibigrid-playbook` command. However, BiBiGrid has a handy
+shortcut for that called `bibiplay`.
+### bibiplay
+To make things easier we wrote the [bibiplay](../../../resources/bin/bibiplay) wrapper. It's used like this:
+is the same as:
+ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts
+any additional arguments are passed to `ansible-playbook`:
+bibiplay -l master
+is the same as:
+ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts -l master
+### Useful commands
+For more options see [ansible-playbook's manpage](https://linux.die.net/man/1/ansible-playbook).
+| Summary | Command |
+| Prepare master manually | `bibiplay -l master` |
+| Prepare only slurm on master manually | `bibiplay -l master -t slurm` |
diff --git a/documentation/markdown/software/slurm.md b/documentation/markdown/software/slurm.md
+# Slurm
+Be aware that due to BiBiGrid's slurm configuration the default behavior of commands might differ slightly from slurm's defaults.
+Everything described below explains how slurm will behave in BiBiGrid's context.
+## Slurm Client
+### Useful commands
+For more options see [slurm client's manpage](https://manpages.debian.org/testing/slurm-client/slurm-wlm.1).
+| Summary | Command | Explanation & Comment |
+| List all present nodes | `sinfo` | Cloud nodes that are powered down are marked `~`. Knowing [Node State Codes](https://manpages.debian.org/testing/slurm-client/sinfo.1.en.html#NODE_STATE_CODES) helps a lot. |
+| Shutdown an instance | `sudo scontrol update NodeName=[node-name] state=POWER_DOWN reason=[reason]` | Powers down the node. The instance will be deleted. |
+| Powerup an instance | `sudo scontrol update NodeName=[node-name] state=POWER_UP reason=[reason]` | Powers up the node. An instance will be created. |
+| Lists all running jobs | `squeue` | Allows you to see whether everything runs as expected. |
+### Read more
+| Summary | Explanation |
+| [NODE STATE CODES](https://slurm.schedmd.com/sinfo.html#SECTION_NODE-STATE-CODES) | Very helpful to interpret `sinfo` correctly. |
diff --git a/documentation/markdown/software/theia_ide.md b/documentation/markdown/software/theia_ide.md
+# Theia IDE
+[Theia Web IDE's](https://www.theia-ide.org/) many features make it easier to work on your cloud instances.
+## Installing Python Syntax Highlighter
diff --git a/documentation/markdown/software/zabbix.md b/documentation/markdown/software/zabbix.md
+# Zabbix
+# Interactive walkthrough of basic slurm scheduling on a cluster.
+# exe prints the given command prefixed with "$" and then executes it, so the user sees what is run.
+exe() { echo "\$" "$@" ; "$@" ; }
+echo "Hello, World! This program will show very basic slurm scheduling."
+echo "I) Only execute this just after logging in and without any prior changes"
+echo "II) You need to have at least one worker in your configuration or this program will hang at some point."
+echo "III) The master should be configured to work as well or this program will hang at some point."
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo "Let's see which servers are up using sinfo (slurm info)!"
+exe sinfo
+echo -e "\nOnly the master is up, since all other workers are configured, but not powered up ('~' is used for nodes that are powered down)."
+echo "See here for more info about node states: https://slurm.schedmd.com/sinfo.html#SECTION_NODE-STATE-CODES"
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo -e "\nLet's execute the 'hostname' command:"
+exe srun hostname
+echo -e "\nAnd see if a server started"
+exe sinfo
+echo -e "\nSince the master is a worker, too, no need to start new workers."
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo -e "\nWhat if we need another server? Let's exclude $(hostname) for now using (-x node-name-to-exclude), so slurm has to power up a worker node."
+echo "While it starts, open another terminal and execute 'squeue'. That will show you the running job."
+echo "Also execute 'sinfo' that will show you the node is powering up ('#' is used for nodes that are powering up). But now let's start another node:"
+# remember when the power up was triggered; srun below blocks until the job has run
+start_time=$(date +%T)
+exe srun -x "$(hostname)" hostname
+# start_time is the trigger time, the freshly evaluated date is the current time
+echo "We triggered the power up at: $start_time. Now it's $(date +%T)."
+echo -e "\nLet's see what changed."
+exe sinfo
+echo "Now a worker powered up as we can see looking at 'sinfo'"
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo -e "\nWorkers that are not used will be shut down after a while."
diff --git a/resources/bin/bibiplay b/resources/bin/bibiplay
+# allows for an easier execution of the ansible playbook no matter where you are
+ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts "$@"
\ No newline at end of file
+# This file is moved programmatically to /etc/ansible/ansible.cfg on the master so it shouldn't be moved manually
+inventory = ./ansible_hosts
+host_key_checking = False
+pipelining = True
+timeout = 60
+ssh_args = -o ControlMaster=auto -o ControlPersist=60s
\ No newline at end of file
+ role_name: Hello-World Example
+ author: Tim Dilger
+ description: Shows working example of installing Ansible Role.
+ company: Bielefeld university, CeBiTec, BiBiServ
+ license: BSD
+ min_ansible_version: 2.7
+ platforms:
+ - name: EL
+ versions:
+ - 7
+ - name: Debian
+ versions:
+ - stretch
+ - name: Ubuntu
+ versions:
+ - xenial
+ - bionic
+ galaxy_tags:
+ - hello-world
+dependencies: []
+ # List your role dependencies here, one per line. Be sure to remove the '[]' above,
+ # if you add dependencies to this list.
+- debug:
+ msg:
+ - "Hello {{ ansible_user }}!"
+nvm_install_dir: /opt/nvm
+theia_version: "next"
+theia_ide_install_dir: /opt/theia-ide
+theia_ide_bind_address: localhost
+theia_ide_bind_port: 8181
+APT::Periodic::Update-Package-Lists "0";
+APT::Periodic::Download-Upgradeable-Packages "0";
+APT::Periodic::AutocleanInterval "0";
+APT::Periodic::Unattended-Upgrade "0";
\ No newline at end of file
\ No newline at end of file
\ No newline at end of file
+# redirect stderr and stdout
+exec >> /var/log/slurm/create.out.log
+exec 2>> /var/log/slurm/create.err.log
+hosts=$(scontrol show hostnames "$1")
+# create and configure requested instances
+python3 /usr/local/bin/create_server.py "${hosts}"
+exit $?
\ No newline at end of file
+#!/usr/bin/env python3
+Creates one or more instances from comma separated name list.
+Is called automatically by create.sh (called by slurm user automatically) which sources a virtual environment.
+import logging
+import math
+from openstack.exceptions import OpenStackCloudException
+import re
+import sys
+import time
+import ansible_runner
+import os_client_config
+import paramiko
+import yaml
+LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
+logging.basicConfig(format=LOGGER_FORMAT, filename="/var/log/slurm/create_server.log", level=logging.INFO)
+logging.info("create_server.py started")
+start_time = time.time()
+def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", username="ubuntu", timeout=5):
+ """
+ Waits until SSH connects successful. This guarantees that the node can be reached via Ansible.
+ @param private_ip: ip of node
+ @param private_key: private ssh key
+ @param username: username of node
+ @param timeout: how long to try
+ @return:
+ """
+ # Wait for SSH Connection available
+ paramiko_key = paramiko.ECDSAKey.from_private_key_file(private_key)
+ with paramiko.SSHClient() as client:
+ client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ attempts = 0
+ establishing_connection = True
+ while establishing_connection:
+ try:
+ client.connect(hostname=private_ip, username=username, pkey=paramiko_key)
+ establishing_connection = False
+ except paramiko.ssh_exception.NoValidConnectionsError as exc:
+ logging.info("Attempting to connect to %s... This might take a while", private_ip)
+ if attempts < timeout:
+ time.sleep(2 ** attempts)
+ attempts += 1
+ else:
+ logging.warning("Attempt to connect to %s failed.", private_ip)
+ raise ConnectionError from exc
+def run_playbook(run_instances):
+ """
+ Runs the BiBiGrid playbook for run_instances
+ @param run_instances: instances to run the playbook for
+ @return:
+ """
+ logging.info("run_playbook with \ninstances: %s", run_instances)
+ # cmdline_args = ["/opt/playbook/site.yml", '-i', '/opt/playbook/ansible_hosts', '-vvvv', '-l', instances]
+ cmdline_args = ["/opt/playbook/site.yml", '-i', '/opt/playbook/ansible_hosts', '-l', ",".join(instances)]
+ executable_cmd = '/usr/local/bin/ansible-playbook'
+ logging.info(f"run_command...\nexecutable_cmd: {executable_cmd}\ncmdline_args: {cmdline_args}")
+ runner = ansible_runner.interface.init_command_config(
+ executable_cmd=executable_cmd,
+ cmdline_args=cmdline_args)
+ runner.run()
+ runner_response = runner.stdout.read()
+ runner_error = runner.stderr.read()
+ return runner, runner_response, runner_error, runner.rc
+if len(sys.argv) < 2:
+ logging.warning("usage: $0 instance1_name[,instance2_name,...]")
+ logging.info("Your input % with length %s", sys.argv, len(sys.argv))
+ sys.exit(1)
+sdk = os_client_config.make_sdk(cloud="master")
+# read instances configuration
+with open("/opt/playbook/vars/instances.yml", mode="r") as f:
+ worker_types = yaml.safe_load(f)
+# read common configuration
+with open("/opt/playbook/vars/common_configuration.yml", mode="r") as f:
+ common_config = yaml.safe_load(f)
+instances = sys.argv[1].split("\n")
+logging.info("Instances: %s", instances)
+server_list = []
+openstack_exception_list = []
+# Iterate over all names and search for a fitting ...
+for worker in instances:
+ # ... worker_type
+ for worker_type in worker_types["workers"]:
+ if re.match(worker_type["regexp"], worker):
+ try:
+ logging.info("Create server %s.", worker)
+ # create server and ...
+ server = sdk.create_server(
+ name=worker,
+ flavor=worker_type["flavor"]["name"],
+ image=worker_type["image"],
+ network=worker_type["network"],
+ key_name=f"tempKey_bibi-{common_config['cluster_id']}",
+ wait=False)
+ # ... add it to server
+ server_list.append(server)
+ # ToDo Better handling, Check edge cases, ...
+ except OpenStackCloudException as exc:
+ logging.warning("While creating %s the OpenStackCloudException %s occurred. Worker ignored.",
+ worker, exc)
+ openstack_exception_list.append(worker)
+# ToDo implement better error handling
+# names of servers that became active but could not be reached via ssh
+no_ssh_list = []
+# names of servers that are active and reachable via ssh
+return_list = []
+# names of servers for which waiting raised an OpenStackCloudException
+openstack_wait_exception_list = []
+for server in server_list:
+    # remember the name up front so it is available for logging no matter where waiting fails
+    server_name = server.name
+    try:
+        sdk.wait_for_server(server, auto_ip=False, timeout=600)
+        # fetch the server again to get its state after waiting
+        server = sdk.get_server(server["id"])
+    except OpenStackCloudException as exc:
+        # log the server that was waited for, not the last name of the creation loop
+        logging.warning("While creating %s the OpenStackCloudException %s occurred.", server_name, exc)
+        openstack_wait_exception_list.append(server_name)
+        continue
+    logging.info("%s is active. Checking ssh", server.name)
+    try:
+        check_ssh_active(server.private_v4)
+        logging.info(f"Server {server.name} is {server.status}.")
+        return_list.append(server.name)
+    except ConnectionError as exc:
+        logging.warning(f"{exc}: Couldn't connect to {server.name}.")
+        no_ssh_list.append(server.name)
+# If no suitable server can be started: abort
+if len(return_list) == 0:
+ logging.warning("No suitable server found! Abort!")
+ exit(1)
+logging.info("Call Ansible to configure instances.")
+# run ansible
+# ToDo: use https://ansible-runner.readthedocs.io/en/latest/ instead of subprocess
+runnable_instances = ",".join(return_list)
+r, response, error, rc = run_playbook(runnable_instances)
+logging.info("Ansible executed!")
+unreachable_list = list(r.stats["dark"].keys())
+failed_list = list(r.stats["failures"].keys())
+overall_failed_list = unreachable_list + failed_list + no_ssh_list + openstack_wait_exception_list
+if overall_failed_list or openstack_exception_list:
+ logging.warning(f"Openstack exception list: {openstack_exception_list}")
+ logging.warning(f"Unable to connect via ssh list: {no_ssh_list}")
+ logging.warning(f"Unreachable list: {unreachable_list}")
+ logging.warning(f"Failed list: {failed_list}")
+ logging.warning(f"Return code: {rc}")
+ for server_name in overall_failed_list:
+ logging.warning(f"Deleting server {server_name}: {sdk.delete_server(server_name)}")
+ logging.warning("Exit Code 1")
+ exit(1)
+logging.info("Successful create_server.py execution!")
+time_in_s = time.time() - start_time
+logging.info(f"--- %s minutes and %s seconds ---", math.floor(time_in_s / 60), time_in_s % 60)
+logging.info("Exit Code 0")
+# redirect stderr and stdout
+exec >> /var/log/slurm/fail.out.log
+exec 2>> /var/log/slurm/fail.err.log
+# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
+scontrol update NodeName="$1" state=RESUME reason=FailedStartup # no sudo needed cause executed by slurm user
+exit $?
\ No newline at end of file
+# /etc/default/slurmrestd
+# Additional options that are passed to the slurmrestd daemon
\ No newline at end of file
+# Override systemd service ExecStart command to disable unixSocket of slurmrestd
+ExecStart=/usr/sbin/slurmrestd $SLURMRESTD_OPTIONS
\ No newline at end of file
+# redirect stderr and stdout
+exec >> /var/log/slurm/terminate.out.log
+exec 2>> /var/log/slurm/terminate.err.log
+function log {
+ echo "$(date) $*"
+log "Terminate invoked $0 $*"
+# extract all hosts from argumentlist
+hosts=$(scontrol show hostnames "$1")
+for host in $hosts
+ # ToDo: Implement better logging in case of an error
+ log "Delete instance ${host} from Zabbix host list."
+ python3 /usr/local/bin/zabbix_host_delete.py --pwd bibigrid "${host}"
+ log "Terminate instance ${host}"
+ openstack --os-cloud master server delete "${host}"
+ log "done"
+ BiBiGrid Overview