diff --git a/.ansible-lint b/.ansible-lint
new file mode 100644
index 000000000..973ff29f3
--- /dev/null
+++ b/.ansible-lint
@@ -0,0 +1,5 @@
+skip_list:
+ - 'fqcn-builtins'
+ - 'fqcn'
+ - 'name[missing]'
+ - 'name[template]'
diff --git a/.gitignore b/.gitignore
index 53354f0e6..bc44901af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,173 @@
-target/
-out/
-dependency-reduced-pom.xml
-*.ipr
-*.iws
-*.iml
-.idea/
-*.retry
+# complete idea
+.idea/
+
+# variable resources
+resources/playbook/site.yml
+resources/playbook/ansible_hosts
+resources/playbook/vars/instances.yml
+resources/playbook/vars/login.yml
+resources/playbook/vars/worker_specification.yml
+resources/playbook/vars/common_configuration.yml
+
+# any log files
+*.log
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 000000000..d3d3a2306
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,619 @@
+[MAIN]
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Load and enable all available extensions. Use --list-extensions to see a list
+# of all available extensions.
+#enable-all-extensions=
+
+# In error mode, messages with a category besides ERROR or FATAL are
+# suppressed, and no reports are done by default. Error mode is compatible with
+# disabling specific errors.
+#errors-only=
+
+# Always return a 0 (non-error) status code, even if lint errors are found.
+# This is primarily useful in continuous integration scripts.
+#exit-zero=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-allow-list=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
+# for backward compatibility.)
+extension-pkg-whitelist=
+
+# Return non-zero exit code if any of these messages/categories are detected,
+# even if score is above --fail-under value. Syntax same as enable. Messages
+# specified are enabled, while categories only check already-enabled messages.
+fail-on=
+
+# Specify a score threshold to be exceeded before program exits with error.
+fail-under=10
+
+# Interpret the stdin as a python script, whose filename needs to be passed as
+# the module_or_package argument.
+#from-stdin=
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=.git
+
+# Add files or directories matching the regex patterns to the ignore-list. The
+# regex matches against paths and can be in Posix or Windows format.
+ignore-paths=
+
+# Files or directories matching the regex patterns are skipped. The regex
+# matches against base names, not paths. The default value ignores Emacs file
+# locks
+ignore-patterns=^\.#
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+jobs=0
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=no
+
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+py-version=3.10
+
+# Discover python modules and packages in the file system subtree.
+recursive=no
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# In verbose mode, extra non-checker-related info will be displayed.
+#verbose=
+
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+msg-template=
+
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+#output-format=
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
+# UNDEFINED.
+confidence=HIGH,
+ CONTROL_FLOW,
+ INFERENCE,
+ INFERENCE_FAILURE,
+ UNDEFINED
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=raw-checker-failed,
+ bad-inline-option,
+ locally-disabled,
+ file-ignored,
+ suppressed-message,
+ useless-suppression,
+ deprecated-pragma,
+ use-symbolic-message-instead,
+ missing-function-docstring,
+ import-error,
+ logging-fstring-interpolation,
+ too-many-arguments,
+ fixme
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=c-extension-no-member
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,
+ XXX,
+ TODO
+
+# Regular expression of note tags to take in consideration.
+notes-rgx=
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of names allowed to shadow builtins
+allowed-redefined-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+ _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+[CLASSES]
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+ __new__,
+ setUp,
+ __post_init__
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+ _fields,
+ _replace,
+ _source,
+ _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style. If left empty, argument names will be checked with the set
+# naming style.
+#argument-rgx=
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,
+ bar,
+ baz,
+ toto,
+ tutu,
+ tata,
+ test,
+ bla,
+ tmp
+
+# Bad variable names regexes, separated by a comma. If names match any regex,
+# they will always be refused
+bad-names-rgxs=
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
+#class-attribute-rgx=
+
+# Naming style matching correct class constant names.
+class-const-naming-style=UPPER_CASE
+
+# Regular expression matching correct class constant names. Overrides class-
+# const-naming-style. If left empty, class constant names will be checked with
+# the set naming style.
+#class-const-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style. If left empty, class names will be checked with the set naming style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style. If left empty, function names will be checked with the set
+# naming style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,
+ j,
+ k,
+ f,
+ ex,
+ Run,
+ _
+
+# Good variable names regexes, separated by a comma. If names match any regex,
+# they will always be accepted
+good-names-rgxs=
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style. If left empty, method names will be checked with the set naming style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style. If left empty, module names will be checked with the set naming style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Regular expression matching correct type variable names. If left empty, type
+# variable names will be checked with the set naming style.
+#typevar-rgx=
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style. If left empty, variable names will be checked with the set
+# naming style.
+#variable-rgx=
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=
+
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
+ext-import-graph=
+
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
+import-graph=
+
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[SIMILARITIES]
+
+# Comments are removed from the similarity computation
+ignore-comments=yes
+
+# Docstrings are removed from the similarity computation
+ignore-docstrings=yes
+
+# Imports are removed from the similarity computation
+ignore-imports=yes
+
+# Signatures are removed from the similarity computation
+ignore-signatures=yes
+
+# Minimum lines number of a similarity.
+min-similarity-lines=6
+
+
+[LOGGING]
+
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of symbolic message names to ignore for Mixin members.
+ignored-checks-for-mixins=no-member,
+ not-async-context-manager,
+ not-context-manager,
+ attribute-defined-outside-init
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+# Regex pattern to define which classes are considered mixins.
+mixin-class-rgx=.*[Mm]ixin
+
+# List of decorators that change the signature of a decorated function.
+signature-mutators=
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the 'python-enchant' package.
+spelling-dict=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains the private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[DESIGN]
+
+# List of regular expressions of class ancestor names to ignore when counting
+# public methods (see R0903)
+exclude-too-few-public-methods=
+
+# List of qualified class names to ignore when counting class parents (see
+# R0901)
+ignored-parents=
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branch for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions=BaseException,
+ Exception
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete name of functions that never returns. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit,argparse.parse_error
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string=' '
+
+# Maximum number of characters on a single line.
+max-line-length=120
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
diff --git a/README.md b/README.md
index e2d10c8ec..f48ce822d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,116 @@
-# BiBiGrid2
+# BiBiGrid
+BiBiGrid is a cloud cluster creation and management framework for OpenStack (and more providers in the future).
-BiBiGrid is a tool for an easy cluster setup inside a cloud environment.
+> **Note**
+> The latest version is currently work in progress. Future changes are likely.
+> Not all features of the previous version are available, but they will come soon.
+> The [previous version](https://github.com/BiBiServ/bibigrid/tree/bibigrid-2.3.1) is still available,
+> but not maintained anymore.
+
+## Getting Started
+For most users, the [Hands-On BiBiGrid Tutorial](https://github.com/deNBI/bibigrid_clum2022)
+is the best entry point.
+
+However, if you are already quite experienced with *OpenStack* and the previous *BiBiGrid*, the following brief explanation
+might be just what you need.
+
+
+<details><summary>Brief, technical BiBiGrid2 overview</summary>
+
+### How to configure a cluster?
+#### Configuration File: bibigrid.yml
+A template file, [bibigrid.yml](bibigrid.yml), is included in the repository.
+
+The cluster configuration file consists of a list of configurations. Each configuration describes the setup for one provider.
+The first configuration additionally contains all keys that apply to the entire cluster (roles, for example).
+Currently, only single-provider clusters are possible, so focus on the first configuration in the list.
+
+The configuration template [bibigrid.yml](bibigrid.yml) contains many helpful comments that make completing it easier.
+
+[Need more details?](documentation/markdown/features/configuration.md)
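+
+For illustration, a minimal single-provider configuration could look like this (a sketch with placeholder values; all keys are explained in the template):
+
+```yaml
+- infrastructure: openstack
+  cloud: openstack
+  sshUser: ubuntu            # depends on the cloud image
+  region: Bielefeld
+  availabilityZone: default
+  subnet: my-subnet          # placeholder; use an existing subnet of your project
+  masterInstance:
+    type: de.NBI tiny        # placeholder; use an existing flavor of your cloud
+    image: Ubuntu 22.04      # placeholder; use an active image of your cloud
+```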
+
+#### Cloud Specification Data: clouds.yml
+To access the cloud, authentication information is required.
+You can download your `clouds.yaml` from OpenStack.
+
+Place your `clouds.yaml` in `~/.config/bibigrid/`; BiBiGrid2 loads it on execution.
+
+[Need more details?](documentation/markdown/features/cloud_specification_data.md)
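+
+For reference, a `clouds.yaml` downloaded from OpenStack typically has this shape (a sketch; the exact auth keys depend on your cloud site and chosen credential type):
+
+```yaml
+clouds:
+  openstack:
+    auth:
+      auth_url: https://cloud.example.com:5000   # placeholder endpoint
+      application_credential_id: "..."
+      application_credential_secret: "..."
+    region_name: Bielefeld
+    interface: public
+    identity_api_version: 3
+    auth_type: v3applicationcredential
+```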
+
+### Quick First Time Usage
+If you haven't used BiBiGrid1 in the past or are unfamiliar with OpenStack, we strongly recommend following the
+[tutorial](https://github.com/deNBI/bibigrid_clum2022) instead.
+
+#### Preparation
+1. Download (or create) the `clouds.yaml` (and optionally `clouds-public.yaml`) file as described [above](#cloud-specification-data-cloudsyml).
+2. Place the `clouds.yaml` into `~/.config/bibigrid`
+3. Fill the configuration, `bibigrid.yml`, with your specifics. At minimum you need: a master instance with a valid type and image,
+a region, an availability zone, an sshUser (most likely `ubuntu`) and a subnet.
+You probably also want at least one worker with a valid type, image and count.
+4. If your cloud provider runs post-launch services, you need to set the `waitForServices`
+key, which expects a list of services to wait for.
+5. Create a virtual environment and install `bibigrid2/requirements.txt` into it (see the example below this list).
+See [here](https://www.akamai.com/blog/developers/how-building-virtual-python-environment) for more detailed info.
+6. Take a look at [First execution](#first-execution)
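+
+For example, one possible virtual-environment setup (assuming Python 3 with `venv` is installed):
+`python3 -m venv ~/.venv/bibigrid && source ~/.venv/bibigrid/bin/activate && pip install -r requirements.txt`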
+
+#### First execution
+First, follow the steps described in [Preparation](#preparation).
+
+After cloning the repository, navigate to `bibigrid2`.
+In order to execute BiBiGrid2, source the virtual environment created during [preparation](#preparation).
+Take a look at BiBiGrid2's [Command Line Interface](documentation/markdown/features/CLI.md)
+if you want to explore for yourself.
+
+A first run-through could look like this:
+1. `./bibigrid.sh -i [path-to-bibigrid.yml] -ch`: checks the configuration
+2. `./bibigrid.sh -i [path-to-bibigrid.yml] -c`: creates the cluster (execute only if the check was successful)
+3. Use **BiBiGrid2's create output** to investigate the created cluster further. Connecting to the ide can be especially helpful.
+Otherwise, connect using ssh.
+4. While connected via ssh, run `sinfo` to print node info.
+5. Run `srun -x $(hostname) hostname` to power up a worker and get its hostname.
+6. Run `sinfo` again to see the node powering up. After a while of inactivity it will be powered down again.
+7. Use the terminate command from **BiBiGrid2's create output** to shut down the cluster again.
+All floating-ips used will be released.
+
+Great! You've just started and terminated your first cluster using BiBiGrid2!
+
+</details>
+
+### Troubleshooting
+If your cluster doesn't start up, first make sure your configuration file is valid (`-ch`).
+If it is not, modify it until the check passes. Use `-v` or `-vv` to get more verbose output,
+so you can find the issue faster. Also double-check that you have sufficient permissions to access the project.
+If you can't make your configuration file valid, please contact a developer.
+If the configuration is valid but the cluster still doesn't start, please contact a developer and/or manually
+check whether your quotas are exceeded. Some quotas currently can't be checked by BiBiGrid.
+
+**Whenever you contact a developer, please send your logfile along.**
+
+# Documentation
+If you would like to learn more about BiBiGrid2, follow a fitting link:
+- [BiBiGrid2 Features](documentation/markdown/bibigrid_feature_list.md)
+- [Software used by BiBiGrid2](documentation/markdown/bibigrid_software_list.md)
+
+
+<details><summary>Differences to BiBiGrid1</summary>
+
+* BiBiGrid2 no longer uses RC files but `clouds.yaml` files for cloud-specification data. Environment variables are no longer used (or supported).
+See [Cloud Specification Data](documentation/markdown/features/cloud_specification_data.md).
+* BiBiGrid2 has a largely reworked configuration file; this was necessary because the BiBiGrid2 core supports multiple providers.
+See [Configuration](documentation/markdown/features/configuration.md).
+* BiBiGrid2 currently implements only one provider: OpenStack.
+* BiBiGrid2 only starts the master and dynamically starts workers via Slurm when they are needed.
+Workers are powered down once they have been unused for a longer period.
+* BiBiGrid2 lays the foundation for clusters spread over multiple providers, but hybrid clouds aren't fully implemented yet.
+
+</details>
+
+# Development
+## Development-Guidelines
+
+[https://github.com/BiBiServ/Development-Guidelines](https://github.com/BiBiServ/Development-Guidelines)
+
+## On implementing concrete providers
+New concrete providers can be implemented easily. Copy the `provider.py` file, inherit from the `provider` class and
+implement all methods for your cloud provider. After that, add your provider to the providerHandler lists, giving it an
+associated name for the configuration files. With that, your provider is automatically included in BiBiGrid2's tests and
+regular execution. By testing your provider first, you will see whether all provider methods are implemented as expected.
\ No newline at end of file
diff --git a/bibigrid.sh b/bibigrid.sh
new file mode 100755
index 000000000..7739c57ad
--- /dev/null
+++ b/bibigrid.sh
@@ -0,0 +1 @@
+#!/usr/bin/env bash
+python3 -m bibigrid2.core.startup "$@"
\ No newline at end of file
diff --git a/bibigrid.yml b/bibigrid.yml
new file mode 100644
index 000000000..69f589079
--- /dev/null
+++ b/bibigrid.yml
@@ -0,0 +1,93 @@
+ # See https://cloud.denbi.de/wiki/Tutorials/BiBiGrid/ (after update)
+ # First configuration will be used for general cluster information and must include the master.
+ # Every other configuration must not include a master, but must include exactly one vpnWorker instead (same keys as master).
+
+- infrastructure: openstack # former mode. Describes what cloud provider is used (others are not implemented yet)
+ cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)
+
+ # -- BEGIN: GENERAL CLUSTER INFORMATION --
+ ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
+ #sshPublicKeyFiles:
+ # - [key one]
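+ # e.g. - ~/.ssh/id_ecdsa.pub (hypothetical path to an additional public key)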
+
+ ## Volumes and snapshots that will be mounted to master
+ #masterMounts: # KEY NOT FULLY IMPLEMENTED YET
+ # - [mount one]
+
+ #nfsShares: # KEY NOT FULLY IMPLEMENTED YET; /vol/spool/ is automatically created as a nfs
+ # - [nfsShare one]
+
+ ## Ansible (Galaxy) roles can be added for execution # KEY NOT IMPLEMENTED YET
+ #ansibleRoles:
+ # - file: SomeFile
+ # hosts: SomeHosts
+ # name: SomeName
+ # vars: SomeVars
+ # vars_file: SomeVarsFile
+
+ #ansibleGalaxyRoles: # KEY NOT IMPLEMENTED YET
+ # - hosts: SomeHost
+ # name: SomeName
+ # galaxy: SomeGalaxy
+ # git: SomeGit
+ # url: SomeURL
+ # vars: SomeVars
+ # vars_file: SomeVarsFile
+
+ ## Uncomment if you don't want to assign a public ip to the master; for internal clusters (Tuebingen).
+ #useMasterWithPublicIp: False # defaults to True; if False, no public ip (floating ip) will be allocated
+
+ # Other keys - default False
+ #localFS: True
+ #localDNSlookup: True
+ #zabbix: True
+ #nfs: True
+ #ide: True # Very useful to set to True. Use `./bibigrid.sh -i [path-to-bibigrid.yml] -ide -cid [cluster-id]` to start port forwarding to access the ide.
+
+ useMasterAsCompute: True # Currently ignored by slurm
+
+ #waitForServices: # existing service names that run after an instance is launched. BiBiGrid's playbook will wait until each service has stopped, to avoid issues
+ # - de.NBI_Bielefeld_environment.service # uncomment for cloud site Bielefeld
+
+ # master configuration
+ masterInstance:
+ type: # existing type/flavor on your cloud. See launch instance>flavor for options
+ image: # existing image on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/images and pick an active one. Currently only ubuntu22.04 is supported
+
+ # -- END: GENERAL CLUSTER INFORMATION --
+
+ # worker configuration
+ #workerInstances:
+ # - type: # existing type/flavor on your cloud. See launch instance>flavor for options
+ # image: # same as master
+ # count: # any number of workers you would like to create with set type, image combination
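+ ## Example with hypothetical values (check your cloud for valid flavor and image names):
+ #workerInstances:
+ # - type: de.NBI tiny
+ # image: Ubuntu 22.04
+ # count: 2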
+
+ # Depends on cloud image
+ sshUser: # for example ubuntu
+
+ # Depends on cloud site:
+ # Berlin : regionOne
+ # Bielefeld : bielefeld
+ # DKFZ : regionOne
+ # Giessen : RegionOne
+ # Heidelberg : RegionOne
+ # Tuebingen : RegionOne
+ region: Bielefeld
+
+ # Depends on cloud site:
+ # Berlin : nova
+ # Bielefeld : default
+ # DKFZ : nova
+ # Giessen : nova
+ # Heidelberg : nova
+ # Tuebingen : nova
+ availabilityZone: default
+
+ # Depends on cloud site and project
+ subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
+
+ # Uncomment if no full DNS service for started instances is available.
+ # Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen.
+ #localDNSLookup: True
+
+ #- [next configurations] # KEY NOT IMPLEMENTED YET
diff --git a/bibigrid2/core/actions/check.py b/bibigrid2/core/actions/check.py
new file mode 100644
index 000000000..41797ec18
--- /dev/null
+++ b/bibigrid2/core/actions/check.py
@@ -0,0 +1,20 @@
+"""
+Module that acts as a wrapper and uses validateConfiguration to validate given configuration
+"""
+import logging
+from bibigrid2.core.utility import validate_configuration
+
+LOG = logging.getLogger("bibigrid")
+
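+# presumably invoked via ./bibigrid.sh -i [path-to-bibigrid.yml] -ch (see README)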
+def check(configurations, providers):
+ """
+ Uses validateConfiguration to validate given configuration.
+ :param configurations: list of configurations (dicts)
+ :param providers: list of providers
+ :return: 0
+ """
+ success = validate_configuration.ValidateConfiguration(configurations, providers).validate()
+ check_result = "succeeded! Cluster is ready to start." if success else "failed!"
+ print(f"Total check {check_result}")
+ LOG.info("Total check returned %s.", success)
+ return 0
diff --git a/bibigrid2/core/actions/create.py b/bibigrid2/core/actions/create.py
new file mode 100644
index 000000000..bd24ac3fe
--- /dev/null
+++ b/bibigrid2/core/actions/create.py
@@ -0,0 +1,362 @@
+"""
+The cluster creation (master's creation, key creation, ansible setup and execution, ...) is done here
+"""
+
+import logging
+import os
+import subprocess
+import threading
+import traceback
+from functools import partial
+
+import paramiko
+import yaml
+
+from bibigrid2.core.actions import terminate_cluster
+from bibigrid2.core.utility import ansible_configurator
+from bibigrid2.core.utility import id_generation
+from bibigrid2.core.utility.handler import ssh_handler
+from bibigrid2.core.utility.paths import ansible_resources_path as aRP
+from bibigrid2.core.utility.paths import bin_path as biRP
+from bibigrid2.models import exceptions
+from bibigrid2.models import return_threading
+from bibigrid2.models.exceptions import ExecutionException
+
+PREFIX = "bibigrid"
+SEPARATOR = "-"
+PREFIX_WITH_SEP = PREFIX + SEPARATOR
+LOG = logging.getLogger("bibigrid")
+
+
+def get_identifier(identifier, cluster_id, worker_group="", additional=""):
+ """
+ This method does more advanced string formatting to generate master, vpnwkr and worker names
+ @param identifier: master|vpnwkr|worker
+ @param cluster_id: id of cluster
+ @param worker_group: group of worker (every member of a group has same flavor/type and image)
+ @param additional: an additional string to be added at the end
+ @return: the generated string
+ """
+ general = PREFIX_WITH_SEP + identifier + str(worker_group) + SEPARATOR + cluster_id
+ if additional:
+ return general + SEPARATOR + str(additional)
+ return general
+
+
+MASTER_IDENTIFIER = partial(get_identifier, identifier="master", additional="")
+WORKER_IDENTIFIER = partial(get_identifier, identifier="worker")
+VPN_WORKER_IDENTIFIER = partial(get_identifier, identifier="vpnwkr")
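+# Illustrative examples for a hypothetical cluster_id "abc123":
+# MASTER_IDENTIFIER(cluster_id="abc123") -> "bibigrid-master-abc123"
+# WORKER_IDENTIFIER(cluster_id="abc123", worker_group=1, additional=2) -> "bibigrid-worker1-abc123-2"
+# These names are matched by SERVER_REGEX in list_clusters.py.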
+
+KEY_PREFIX = "tempKey_bibi"
+KEY_FOLDER = os.path.expanduser("~/.config/bibigrid/keys/")
+AC_NAME = "ac" + SEPARATOR + "{cluster_id}"
+KEY_NAME = KEY_PREFIX + SEPARATOR + "{cluster_id}"
+CLUSTER_MEMORY_FOLDER = KEY_FOLDER
+CLUSTER_MEMORY_FILE = ".bibigrid.mem"
+CLUSTER_MEMORY_PATH = os.path.join(CLUSTER_MEMORY_FOLDER, CLUSTER_MEMORY_FILE)
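+# CLUSTER_MEMORY_PATH stores the id of the most recently created cluster so later calls can fall back to it when no cid is given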
+
+
+class Create: # pylint: disable=too-many-instance-attributes,too-many-arguments
+ """
+ The class Create holds necessary methods to execute the Create-Action
+ """
+
+ def __init__(self, providers, configurations, config_path, debug=False):
+ """
+ Additionally sets (unique) cluster_id, public_key_commands (to copy public keys to master) and key_name.
+ Call create() to actually start server.
+ :param providers: List of providers (provider)
+ :param configurations: List of configurations (dict)
+ :param config_path: string that is the path to config-file
+ :param debug: Bool. If True, the cluster offers shut-down after creation and
+ asks before shutting down on errors
+ """
+ self.providers = providers
+ self.configurations = configurations
+ self.debug = debug
+ self.cluster_id = id_generation.generate_safe_cluster_id(providers)
+ self.ssh_user = configurations[0].get("sshUser") or "ubuntu"
+ self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands(
+ configurations[0].get("sshPublicKeyFiles"))
+ self.config_path = config_path
+ self.master_ip = None
+ LOG.debug("Cluster-ID: %s", self.cluster_id)
+ self.name = AC_NAME.format(cluster_id=self.cluster_id)
+ self.key_name = KEY_NAME.format(cluster_id=self.cluster_id)
+ self.instance_counter = 0
+ self.thread_lock = threading.Lock()
+ self.use_master_with_public_ip = configurations[0].get("useMasterWithPublicIp", True)
+ LOG.debug("Keyname: %s", self.key_name)
+
+ def generate_keypair(self):
+ """
+ Generates ECDSA Keypair using system-function ssh-keygen and uploads the generated public key to providers.
+ ToDo find a more pythonic way to create an ECDSA keypair
+ See here for why using python module ECDSA wasn't successful
+ https://stackoverflow.com/questions/71194770/why-does-creating-ecdsa-keypairs-via-python-differ-from-ssh-keygen-t-ecdsa-and
+ :return:
+ """
+ # create KEY_FOLDER if it doesn't exist
+ if not os.path.isdir(KEY_FOLDER):
+ LOG.info("%s not found. Creating folder.", KEY_FOLDER)
+ os.mkdir(KEY_FOLDER)
+ # generate keyfile
+ res = subprocess.check_output(f'ssh-keygen -t ecdsa -f {KEY_FOLDER}{self.key_name} -P ""', shell=True).decode()
+ LOG.debug(res)
+ # read private keyfile
+ with open(f"{os.path.join(KEY_FOLDER, self.key_name)}.pub", mode="r", encoding="UTF-8") as key_file:
+ public_key = key_file.read()
+ # upload keyfiles
+ for provider in self.providers:
+ provider.create_keypair(name=self.key_name, public_key=public_key)
+
+ # write cluster_id to automatically read it on following calls if no cid is given
+ with open(CLUSTER_MEMORY_PATH, mode="w+", encoding="UTF-8") as cluster_memory_file:
+ yaml.safe_dump(data={"cluster_id": self.cluster_id}, stream=cluster_memory_file)
+
+ def start_instance(self, provider, identifier, instance_type, network, volumes=None,
+ external_network=None):
+ """
+ Starts a single (master, worker or vpn) server/instance in the given network on the given provider,
+ with a floating-ip if it is a master or vpn instance and with volumes if it is a master.
+ :param provider: provider server will be started on
+ :param identifier: string MASTER/WORKER/VPN_IDENTIFIER
+ :param instance_type: dict from configuration containing server type, image and count (but count is not needed)
+ :param network: string network where server will be started in.
+ All servers of a provider are started in the same network
+ :param volumes: list of volumes that are to be attached to the server. Currently only relevant for master
+ :param external_network: string; only needed for master/vpn instances to create a floating_ip
+ :return:
+ """
+ # potentially weird counting due to master
+ with self.thread_lock:
+ if identifier == MASTER_IDENTIFIER: # pylint: disable=comparison-with-callable
+ name = identifier(cluster_id=self.cluster_id)
+ else: # worker or vpnwkr; get_identifier has no "number" parameter, so the counter is passed as "additional"
+ name = identifier(cluster_id=self.cluster_id, additional=self.instance_counter)
+ self.instance_counter += 1
+ LOG.info("Starting instance/server %s", name)
+ flavor = instance_type["type"]
+ image = instance_type["image"]
+ server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name,
+ image=image, network=network, volumes=volumes)
+ floating_ip = None
+ # pylint: disable=comparison-with-callable
+ if identifier == VPN_WORKER_IDENTIFIER or (
+ identifier == MASTER_IDENTIFIER and self.use_master_with_public_ip):
+ # waiting for the floating ip seems to be included, though it is not documented
+ floating_ip = provider.attach_available_floating_ip(network=external_network,
+ server=server)["floating_ip_address"]
+ elif identifier == MASTER_IDENTIFIER:
+ floating_ip = provider.conn.get_server(server["id"])["private_v4"]
+ # pylint: enable=comparison-with-callable
+ return floating_ip
+
+ def start_instances(self, configuration, provider):
+ """
+ Starts all instances of a provider using multithreading
+ :param configuration: dict configuration of said provider
+ :param provider: provider
+ :return:
+ """
+ LOG.info("Starting instances on %s", provider.NAME)
+ # threads = []
+ identifier, instance_type, volumes = self.prepare_vpn_or_master_args(configuration, provider)
+ external_network = provider.get_external_network(configuration["network"])
+
+ # Starts master/vpn. Uses return threading to get floating_ip of master/vpn
+ vpn_or_master_thread = return_threading.ReturnThread(target=self.start_instance,
+ args=[provider,
+ identifier,
+ instance_type,
+ configuration["network"],
+ volumes,
+ external_network])
+ vpn_or_master_thread.start()
+
+ # Starts all workers
+ # for worker_instance_type in configuration.get("workerInstances") or []:
+ # for worker in range(worker_instance_type["count"]):
+ # worker_thread = threading.Thread(target=self.start_instance,
+ # args=[provider,
+ # WORKER_IDENTIFIER,
+ # worker_instance_type,
+ # configuration["network"],
+ # True])
+ # worker_thread.start()
+ # threads.append(worker_thread)
+ LOG.info("Waiting for servers to start-up on cloud %s", provider.cloud_specification['identifier'])
+ vpn_or_m_floating_ip_address = vpn_or_master_thread.join()
+ self.setup_reachable_servers(configuration, vpn_or_m_floating_ip_address)
+ # for thread in threads:
+ # thread.join()
+
+ def prepare_vpn_or_master_args(self, configuration, provider):
+ """
+ Prepares start_instance arguments for master/vpn
+ :param configuration: configuration (dict) of said master/vpn
+ :param provider: provider
+ :return: arguments needed by start_instance
+ """
+ if configuration.get("masterInstance"):
+ instance_type = configuration["masterInstance"]
+ identifier = MASTER_IDENTIFIER
+ master_mounts = configuration.get("masterMounts", [])
+ volumes = self.prepare_volumes(provider, master_mounts)
+ elif configuration.get("vpnInstance"):
+ instance_type = configuration["vpnInstance"]
+ identifier = VPN_WORKER_IDENTIFIER
+ volumes = [] # only master has volumes
+ else:
+ LOG.warning("Configuration %s has no vpnInstance or masterInstance and is therefore unreachable.", configuration)
+ raise KeyError("neither masterInstance nor vpnInstance found in configuration")
+ return identifier, instance_type, volumes
+
+ def setup_reachable_servers(self, configuration, vpn_or_m_floating_ip_address):
+ """
+ Executes necessary commands on master or vpnwkr
+ :param configuration: said configuration
+ :param vpn_or_m_floating_ip_address: floating_ip to master or vpnwkr
+ """
+ if configuration.get("masterInstance"):
+ self.master_ip = vpn_or_m_floating_ip_address
+ ssh_handler.ansible_preparation(floating_ip=vpn_or_m_floating_ip_address,
+ private_key=KEY_FOLDER + self.key_name,
+ username=self.ssh_user,
+ commands=self.ssh_add_public_key_commands)
+ elif configuration.get("vpnInstance"):
+ ssh_handler.execute_ssh(floating_ip=self.master_ip,
+ private_key=KEY_FOLDER + self.key_name,
+ username=self.ssh_user,
+ commands=ssh_handler.VPN_SETUP)
+
+ def prepare_volumes(self, provider, mounts):
+ """
+ Creates volumes from snapshots and returns all volumes (pre-existing and newly created)
+ :param provider: provider on which the volumes and snapshots exist
+ :param mounts: volumes or snapshots
+ :return: set of ids of pre-existing and newly created volumes
+ """
+ LOG.info("Preparing volumes")
+ volumes = []
+ for mount in mounts:
+ volume_id = provider.get_volume_by_id_or_name(mount)["id"]
+ if volume_id:
+ volumes.append(volume_id)
+ else:
+ LOG.debug("Volume %s does not exist. Checking for snapshot.", mount)
+ volume_id = provider.create_volume_from_snapshot(mount)
+ if volume_id:
+ volumes.append(volume_id)
+ else:
+ LOG.warning("Mount %s is neither a snapshot nor a volume.", mount)
+ ret_volumes = set(volumes)
+ if len(ret_volumes) < len(volumes):
+ LOG.warning("Identical mounts found in masterMounts list. "
+ "Duplicates were removed to save the run. Check configurations!")
+ return ret_volumes
+
+ def prepare_configurations(self):
+ """
+ Makes sure that subnet and network key are set for each configuration.
+ If neither is set, a KeyError is raised and caught in create.
+ :return:
+ """
+ for configuration, provider in zip(self.configurations, self.providers):
+ if not configuration.get("network"):
+ configuration["network"] = provider.get_network_id_by_subnet(configuration["subnet"])
+ elif not configuration.get("subnet"):
+ configuration["subnet"] = provider.get_subnet_ids_by_network(configuration["network"])
+ configuration["sshUser"] = self.ssh_user # is used in ansibleConfigurator
+
+ def upload_data(self):
+ """
+ Configures ansible and then uploads the modified files and all necessary data to the master
+ :return:
+ """
+ if not os.path.isdir(aRP.VARS_FOLDER):
+ LOG.info("%s not found. Creating folder.", aRP.VARS_FOLDER)
+ os.mkdir(aRP.VARS_FOLDER)
+ ansible_configurator.configure_ansible_yaml(providers=self.providers,
+ configurations=self.configurations,
+ cluster_id=self.cluster_id)
+ ssh_handler.execute_ssh(floating_ip=self.master_ip, private_key=KEY_FOLDER + self.key_name,
+ username=self.ssh_user,
+ filepaths=[(aRP.PLAYBOOK_PATH, aRP.PLAYBOOK_PATH_REMOTE),
+ (biRP.BIN_PATH, biRP.BIN_PATH_REMOTE)],
+ commands=ssh_handler.ANSIBLE_START +
+ [ssh_handler.get_ac_command(self.providers[0], AC_NAME.format(
+ cluster_id=self.cluster_id))])
+
+ def start_start_instances_threads(self):
+ """
+ Starts for each provider a start_instances thread and joins them.
+ :return:
+ """
+ start_instances_threads = []
+ for configuration, provider in zip(self.configurations, self.providers):
+ start_instances_thread = return_threading.ReturnThread(target=self.start_instances,
+ args=[configuration, provider])
+ start_instances_thread.start()
+ start_instances_threads.append(start_instances_thread)
+ for start_instance_thread in start_instances_threads:
+ start_instance_thread.join()
+
+ def create(self):
+ """
+ Creates cluster and prints helpful cluster-info afterwards.
+ If debug is set True it offers termination after starting the cluster.
+ :return: exit_state
+ """
+ self.generate_keypair()
+ try:
+ self.prepare_configurations()
+ self.start_start_instances_threads()
+ self.upload_data()
+ self.print_cluster_start_info()
+ if self.debug:
+ LOG.info("DEBUG MODE: Entering termination...")
+ terminate_cluster.terminate_cluster(cluster_id=self.cluster_id, providers=self.providers,
+ debug=self.debug)
+ except exceptions.ConnectionException:
+ LOG.error("Connection couldn't be established. Check Provider connection.")
+ except paramiko.ssh_exception.NoValidConnectionsError:
+ LOG.error("SSH connection couldn't be established. Check keypair.")
+ except KeyError as exc:
+ LOG.error(f"Tried to access dictionary key {str(exc)}, but couldn't. Please check your configurations.")
+ except FileNotFoundError as exc:
+ LOG.error(f"Tried to access resource files but couldn't. No such file or directory: {str(exc)}")
+ except TimeoutError as exc:
+ LOG.error(f"Timeout while connecting to master. Maybe you are trying to create a master without "
+ f"public ip "
+ f"while not being in the same network: {str(exc)}")
+ except ExecutionException as exc:
+ if self.debug:
+ LOG.error(traceback.format_exc())
+ LOG.error(f"Execution of cmd on remote host fails: {str(exc)}")
+ except Exception as exc: # pylint: disable=broad-except
+ if self.debug:
+ LOG.error(traceback.format_exc())
+ LOG.error(f"Unexpected error: '{str(exc)}' ({type(exc)}). Contact a developer!")
+ else:
+ return 0 # will be called if no exception occurred
+ terminate_cluster.terminate_cluster(cluster_id=self.cluster_id, providers=self.providers, debug=self.debug)
+ return 1
+
+ def print_cluster_start_info(self):
+ """
+ Prints helpful cluster-info:
+ SSH: How to connect to master via SSH
+ Terminate: What bibigrid2 command is needed to terminate the created cluster
+ Detailed cluster info: How to print detailed info about the created cluster
+ :return:
+ """
+ print(f"Cluster {self.cluster_id} with master {self.master_ip} up and running!")
+ print(f"SSH: ssh -i '{KEY_FOLDER}{self.key_name}' {self.ssh_user}@{self.master_ip}")
+ print(f"Terminate cluster: ./bibigrid.sh -i '{self.config_path}' -t -cid {self.cluster_id}")
+ print(f"Detailed cluster info: ./bibigrid.sh -i '{self.config_path}' -l -cid {self.cluster_id}")
+ if self.configurations[0].get("ide"):
+ print(f"IDE Port Forwarding: ./bibigrid.sh -i '{self.config_path}' -ide -cid {self.cluster_id}")
diff --git a/bibigrid2/core/actions/ide.py b/bibigrid2/core/actions/ide.py
new file mode 100644
index 000000000..d1877a826
--- /dev/null
+++ b/bibigrid2/core/actions/ide.py
@@ -0,0 +1,95 @@
+"""
+This module contains methods to establish port forwarding in order to access an ide (theia).
+"""
+
+import logging
+import random
+import re
+import signal
+import subprocess
+import sys
+import time
+import webbrowser
+import sshtunnel
+
+from bibigrid2.core.utility.handler import cluster_ssh_handler
+
+DEFAULT_IDE_WORKSPACE = "${HOME}"
+REMOTE_BIND_ADDRESS = 8181
+DEFAULT_IDE_PORT_END = 8383
+LOCAL_BIND_ADDRESS = 9191
+MAX_JUMP = 100
+LOCALHOST = "127.0.0.1"
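+# if LOCAL_BIND_ADDRESS is already in use, ide() retries on a new local port, jumping ahead by up to MAX_JUMP each attempt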
+LOG = logging.getLogger("bibigrid")
+
+def sigint_handler(caught_signal, frame): # pylint: disable=unused-argument
+ """
+ Is called when SIGINT is thrown and terminates the program
+ @param caught_signal:
+ @param frame:
+ @return: 0
+ """
+ print("Exiting...")
+ sys.exit(0)
+signal.signal(signal.SIGINT, sigint_handler)
+
+
+def is_used(ip_address):
+ """
+ Checks which local ports have an ESTABLISHED connection to ip_address.
+ https://stackoverflow.com/questions/62000168/how-to-check-if-ssh-tunnel-is-being-used
+ :return: list of used ports
+ """
+ ports_used = []
+ with subprocess.Popen(["netstat", "-na"], stdout=subprocess.PIPE) as process:
+ out = process.stdout.read()
+ lines = out.decode('utf-8').split('\n')
+ for line in lines:
+ is_open = re.match(rf'tcp.*{ip_address}:([0-9][0-9]*).*ESTABLISHED\s*$', line)
+ if is_open is not None:
+ print(line)
+ ports_used.append(is_open[1])
+ return ports_used
+
+
+def ide(cluster_id, master_provider, master_configuration):
+ """
+ Creates a port forwarding from LOCAL_BIND_ADDRESS to REMOTE_BIND_ADDRESS from localhost to master of specified
+ cluster
+ @param cluster_id: cluster_id or ip
+ @param master_provider: master's provider
+ @param master_configuration: master's configuration
+ @return:
+ """
+ LOG.info("Starting port forwarding for ide")
+ master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider,
+ master_configuration)
+ used_local_bind_address = LOCAL_BIND_ADDRESS
+ if master_ip and ssh_user and used_private_key:
+ attempts = 0
+ while attempts < 16:
+ attempts += 1
+ try:
+ with sshtunnel.SSHTunnelForwarder(
+ ssh_address_or_host=master_ip, # master's floating ip
+
+ ssh_username=ssh_user,
+ ssh_pkey=used_private_key,
+
+ local_bind_address=(LOCALHOST, used_local_bind_address),
+ remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)
+ ) as server:
+ print("CTRL+C to close port forwarding when you are done.")
+ with server:
+ # opens in existing window if any default program exists
+ webbrowser.open(f"http://localhost:{used_local_bind_address}", new=2)
+ while True:
+ time.sleep(5)
+ except sshtunnel.HandlerSSHTunnelForwarderError:
+ used_local_bind_address += random.randint(1, MAX_JUMP)
+ LOG.info("Attempt: %s. Port in use... Trying new port %s", attempts, used_local_bind_address)
+ if not master_ip:
+ LOG.warning("Cluster id %s doesn't match an existing cluster with a master.", cluster_id)
+ if not ssh_user:
+ LOG.warning("No ssh user has been specified in the first configuration.")
+ if not used_private_key:
+ LOG.warning("No matching sshPublicKeyFiles can be found in the first configuration or in .bibigrid")
+ return 1
diff --git a/bibigrid2/core/actions/list_clusters.py b/bibigrid2/core/actions/list_clusters.py
new file mode 100644
index 000000000..58f9924ae
--- /dev/null
+++ b/bibigrid2/core/actions/list_clusters.py
@@ -0,0 +1,152 @@
+"""
+This module contains methods to list all clusters or a specific cluster in a formatted, readable output.
+This includes a method to create a dictionary containing all running clusters and their servers.
+"""
+
+import logging
+import pprint
+import re
+
+from bibigrid2.core.actions import create
+
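+# matches e.g. "bibigrid-master-abc123" or "bibigrid-worker1-abc123-2" (hypothetical cluster-id and counter)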
+SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpnwkr)\d+-([a-zA-Z0-9]+)-\d+)$")
+LOG = logging.getLogger("bibigrid")
+
+def dict_clusters(providers):
+ """
+ Creates a dictionary containing all servers by type and provider information
+ :param providers: list of all providers
+ :return: dict of all found clusters
+ """
+ LOG.info("Creating cluster dictionary...")
+ cluster_dict = {}
+ for provider in providers:
+ servers = provider.list_servers()
+ for server in servers:
+ result = SERVER_REGEX.match(server["name"])
+ if result:
+ identifier = result.group(4) or result.group(2)
+ cluster_id = result.group(5) or result.group(3)
+ setup(cluster_dict, cluster_id, server, provider)
+ if identifier == "master":
+ cluster_dict[cluster_id][identifier] = server
+ else:
+ cluster_dict[cluster_id][identifier + "s"].append(server)
+ return cluster_dict
+
+
+def setup(cluster_dict, cluster_id, server, provider):
+ """
+ Creates an empty entry for cluster_id in cluster_dict if none exists yet
+ and annotates the server with its provider information.
+ :param cluster_dict: dict containing all found servers by their cluster_id
+ :param cluster_id: id of said cluster
+ :param server: found server (dict)
+ :param provider: server's provider
+ :return:
+ """
+ if not cluster_dict.get(cluster_id):
+ cluster_dict[cluster_id] = {}
+ cluster_dict[cluster_id]["workers"] = []
+ cluster_dict[cluster_id]["vpnwkrs"] = []
+ server["provider"] = provider.NAME
+ server["cloud_specification"] = provider.cloud_specification["identifier"]
+
+
+def print_list_clusters(cluster_id, providers):
+ """
+ Calls dict_clusters and gives a visual representation of the found cluster.
+ Detail depends on whether a cluster_id is given or not.
+ :param cluster_id:
+ :param providers:
+ :return:
+ """
+ cluster_dict = dict_clusters(providers=providers)
+ if cluster_id: # pylint: disable=too-many-nested-blocks
+ if cluster_dict.get(cluster_id):
+ LOG.info("Printing specific cluster_dictionary")
+ master_count, worker_count, vpn_count = get_size_overview(cluster_dict[cluster_id])
+ print(f"\tCluster has {master_count} master, {vpn_count} vpnwkr and {worker_count} regular workers. "
+ f"The cluster is spread over {vpn_count + master_count} reachable provider(s).")
+ pprint.pprint(cluster_dict[cluster_id])
+ else:
+ LOG.info("Cluster with cluster-id {cluster_id} not found.")
+ print(f"Cluster with cluster-id {cluster_id} not found.")
+ else:
+ LOG.info("Printing overview of cluster all clusters")
+ if cluster_dict:
+ for cluster_key_id, cluster_node_dict in cluster_dict.items():
+ print(f"Cluster-ID: {cluster_key_id}")
+ master = cluster_node_dict.get('master')
+ if master:
+ for key in ["name", "user_id", "launched_at", "key_name", "public_v4", "public_v6", "provider"]:
+ value = cluster_node_dict['master'].get(key)
+ if value:
+ print(f"\t{key}: {value}")
+ security_groups = get_security_groups(cluster_node_dict)
+ print(f"\tsecurity_groups: {security_groups}")
+ networks = get_networks(cluster_node_dict)
+ print(f"\tnetwork: {pprint.pformat(networks)}")
+ else:
+ LOG.warning("No master for cluster: %s.", cluster_key_id)
+ master_count, worker_count, vpn_count = get_size_overview(cluster_node_dict)
+ print(f"\tCluster has {master_count} master, {vpn_count} vpnwkr and {worker_count} regular workers. "
+ f"The cluster is spread over {vpn_count + master_count} reachable provider(s).")
+ else:
+ print("No cluster found.")
+ return 0
+
+
+def get_size_overview(cluster_dict):
+ """
+ :param cluster_dict: dictionary of cluster to size_overview
+ :return: number of masters, number of workers, number of vpns
+ """
+ LOG.info("Printing size overview")
+ master_count = int(bool(cluster_dict.get("master")))
+ worker_count = len(cluster_dict.get("workers") or [])
+ vpn_count = len(cluster_dict.get("vpnwkrs") or [])
+ return master_count, worker_count, vpn_count
+
+
+def get_networks(cluster_dict):
+ """
+ Gets all addresses of servers
+ :param cluster_dict: dictionary of clusters to find addresses
+ :return: dict containing addresses
+ """
+ master = cluster_dict["master"]
+ addresses = [{master["provider"]: list(master["addresses"].keys())}]
+ for server in (cluster_dict.get("vpnwkrs") or []):
+ addresses.append({server["provider"]: list(server["addresses"].keys())})
+ return addresses
+
+
+def get_security_groups(cluster_dict):
+ """
+ Gets all security group of servers
+ :param cluster_dict: dictionary of clusters to find security_groups
+ :return: dict containing security_groups
+ """
+ master = cluster_dict["master"]
+ security_groups = [{master["provider"]: master["security_groups"]}]
+ for server in (cluster_dict.get("vpnwkrs") or []):
+ security_groups.append({server["provider"]: server["security_groups"]})
+ return security_groups
+
+
+def get_master_access_ip(cluster_id, master_provider):
+ """
+ Returns master's ip of cluster cluster_id
+ :param master_provider: master's provider
+ :param cluster_id: Id of cluster
+ :return: public ip of master
+ """
+ LOG.info("Finding master ip for cluster %s...", cluster_id)
+ servers = master_provider.list_servers()
+ for server in servers:
+ master = create.MASTER_IDENTIFIER(cluster_id=cluster_id)
+ if server["name"].startswith(master):
+ return server.get("public_v4") or server.get("public_v6") or server.get("private_v4")
+ LOG.warning("Cluster %s not found on master_provider %s.", cluster_id, master_provider)
+ return None
diff --git a/bibigrid2/core/actions/terminate_cluster.py b/bibigrid2/core/actions/terminate_cluster.py
new file mode 100644
index 000000000..67f744dc8
--- /dev/null
+++ b/bibigrid2/core/actions/terminate_cluster.py
@@ -0,0 +1,173 @@
+"""
+This module contains methods to terminate a cluster, i.e. to delete all servers, keypairs (local and remote)
+and application credentials used by it.
+"""
+
+import logging
+import os
+import re
+
+from bibigrid2.core.actions import create
+LOG = logging.getLogger("bibigrid")
+
+def terminate_cluster(cluster_id, providers, debug=False):
+ """
+ Goes through all providers and gets info of all servers whose name contains the cluster ID.
+ It then checks for resources that are still reserved by the cluster but no longer used and frees them.
+ :param debug: if set, the user is asked before the termination is executed
+ :param providers: list of all providers
+ :param cluster_id: ID of cluster to terminate
+ :return: VOID
+ """
+ if debug:
+ if not input(f"DEBUG MODE: Any non-empty input to shutdown cluster {cluster_id}. "
+ "Empty input to exit with cluster still alive:"):
+ return 0
+ cluster_server_state = []
+ cluster_keypair_state = []
+ tmp_keyname = create.KEY_NAME.format(cluster_id=cluster_id)
+ local_keypairs_deleted = delete_local_keypairs(tmp_keyname)
+ if local_keypairs_deleted or input(f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. "
+ f"This might not be your cluster. Are you sure you want to terminate it?\n"
+ f"Any non-empty input to shutdown cluster {cluster_id}. "
+ f"Empty input to exit with cluster still alive:"):
+ for provider in providers:
+ LOG.info("Terminating cluster %s on on cloud %s",
+ cluster_id, provider.cloud_specification['identifier'])
+ server_list = provider.list_servers()
+ cluster_server_state += terminate_servers(server_list, cluster_id, provider)
+ cluster_keypair_state.append(delete_keypairs(provider, tmp_keyname))
+ ac_state = delete_application_credentials(providers[0], cluster_id)
+ terminate_output(cluster_server_state, cluster_keypair_state, ac_state, cluster_id)
+ return 0
+
+
+def terminate_servers(server_list, cluster_id, provider):
+ """
+ Terminates all servers in server_list that match the bibigrid regex.
+ @param server_list: list of server dicts. All servers are from provider
+ @param cluster_id: id of cluster to terminate
+ @param provider: provider that holds all servers in server_list
+ @return: a list of the servers' (that were to be terminated) termination states
+ """
+ LOG.info("Deleting servers on provider %s...", provider.cloud_specification['identifier'])
+ cluster_server_state = []
+ # ^bibigrid-(master-{cluster_id}|(worker|vpnwkr){group}-{cluster_id}-{number})$
+ server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}|(worker|vpnwkr)\d+-{cluster_id}-\d+)$")
+ for server in server_list:
+ if server_regex.match(server["name"]):
+ LOG.info("Trying to terminate Server %s on cloud %s.",
+ server['name'], provider.cloud_specification['identifier'])
+ cluster_server_state.append(terminate_server(provider, server))
+ return cluster_server_state
+
+
+def terminate_server(provider, server):
+ """
+ Terminates a single server and stores the termination state
+ @param provider: the provider that holds the server
+ @param server: the server that is to be terminated
+ @return: True if the server has been terminated, False otherwise
+ """
+ terminated = provider.delete_server(server["id"])
+ if not terminated:
+ LOG.warning("Unable to terminate server %s on provider %s.",
+ server['name'], provider.cloud_specification['identifier'])
+ else:
+ LOG.info("Server %s terminated on provider %s.",
+ server['name'], provider.cloud_specification['identifier'])
+ return terminated
+
+
+def delete_keypairs(provider, tmp_keyname):
+ """
+ Deletes the keypair from the given provider
+ @param provider: provider to delete keypair from
+ @param tmp_keyname: BiBiGrid2 keyname
+ @return: True if keypair was deleted
+ """
+ LOG.info("Deleting Keypair on provider %s...", provider.cloud_specification['identifier'])
+ deleted = provider.delete_keypair(tmp_keyname)
+ if deleted:
+ LOG.info("Keypair %s deleted on provider %s.", tmp_keyname, provider.cloud_specification['identifier'])
+ else:
+ LOG.warning("Unable to delete %s on provider %s.", tmp_keyname, provider.cloud_specification['identifier'])
+ return deleted
+
+
+def delete_local_keypairs(tmp_keyname):
+ """
+ Deletes local keypairs of a cluster
+ @param tmp_keyname: BiBiGrid2 keyname
+ @return: True if at least one local keyfile (public or private) was found and deleted
+ """
+ success = False
+ LOG.info("Deleting Keypair locally...")
+ tmp_keypath = os.path.join(create.KEY_FOLDER, tmp_keyname)
+ pub_tmp_keypath = tmp_keypath + ".pub"
+ if os.path.isfile(tmp_keypath):
+ os.remove(tmp_keypath)
+ success = True
+ else:
+ LOG.warning(f"Unable to find private keyfile '{tmp_keypath}' locally. No local private keyfile deleted.")
+ if os.path.isfile(pub_tmp_keypath):
+ os.remove(pub_tmp_keypath)
+ success = True
+ else:
+ LOG.warning(f"Unable to find public keyfile '{pub_tmp_keypath}' locally. No local public keyfile deleted.")
+ return success
+
+
+def delete_application_credentials(master_provider, cluster_id):
+ """
+ Deletes application credentials from the master_provider
+ @param master_provider: provider that holds the master
+ @param cluster_id:
+ @return: True if no cluster credential remains on the provider. Else False.
+ """
+ # implement deletion
+ auth = master_provider.cloud_specification["auth"]
+ if not auth.get("application_credential_id") or not auth.get("application_credential_secret"):
+ return master_provider.delete_application_credential_by_id_or_name(create.AC_NAME.format(cluster_id=cluster_id))
+ LOG.info("Because you used application credentials to authenticate, "
+ "no created application credentials need deletion.")
+ return True
+
+
+def terminate_output(cluster_server_state, cluster_keypair_state, ac_state, cluster_id):
+ """
+ Logs the termination result in detail
+ @param cluster_server_state: list of bools. Each bool stands for a server termination
+ @param cluster_keypair_state: list of bools. Each bool stands for a keypair deletion
+ @param ac_state: bool that stands for the deletion of the credentials on the master
+ @param cluster_id:
+ @return:
+ """
+ cluster_existed = bool(cluster_server_state)
+ cluster_server_terminated = all(cluster_server_state)
+ cluster_keypair_deleted = all(cluster_keypair_state)
+ if cluster_existed:
+ if cluster_server_terminated:
+ LOG.info("Terminated all servers of cluster %s.", cluster_id)
+ else:
+ LOG.warning("Unable to terminate all servers of cluster %s.", cluster_id)
+ if cluster_keypair_deleted:
+ LOG.info("Deleted all keypairs of cluster %s.", cluster_id)
+ else:
+ LOG.warning("Unable to delete all keypairs of cluster %s.", cluster_id)
+ if cluster_server_terminated and cluster_keypair_deleted:
+ out = f"Successfully terminated cluster {cluster_id}."
+ LOG.info(out)
+ print(out)
+ else:
+ LOG.warning("Unable to terminate cluster %s properly."
+ "\nAll servers terminated: %s\nAll keys deleted: %s",
+ cluster_id, cluster_server_terminated, cluster_keypair_deleted)
+ if ac_state:
+ LOG.info("Successfully handled application credential of cluster %s.", cluster_id)
+ else:
+ LOG.warning("Unable to delete application credential of cluster %s", cluster_id)
+ else:
+ LOG.warning("Unable to find any servers for cluster-id %s. "
+ "Check cluster-id and configuration.\nAll keys deleted: %s",
+ cluster_id, cluster_keypair_deleted)
diff --git a/bibigrid2/core/actions/update.py b/bibigrid2/core/actions/update.py
new file mode 100644
index 000000000..091e39300
--- /dev/null
+++ b/bibigrid2/core/actions/update.py
@@ -0,0 +1,27 @@
+"""
+Module that contains methods to update the master playbook
+"""
+
+import logging
+
+from bibigrid2.core.utility import ansible_commands as aC
+from bibigrid2.core.utility.handler import ssh_handler
+from bibigrid2.core.utility.paths import ansible_resources_path as aRP
+from bibigrid2.core.utility.paths import bin_path as biRP
+from bibigrid2.core.utility.handler import cluster_ssh_handler
+
+LOG = logging.getLogger("bibigrid")
+
+def update(cluster_id, master_provider, master_configuration):
+ """
+ Copies the local playbook and bin files to the master and re-executes the playbook there.
+ :param cluster_id: id of the cluster to update
+ :param master_provider: master's provider
+ :param master_configuration: master's configuration
+ :return: 0 on success, 1 if no ssh connection information could be determined
+ """
+ LOG.info("Starting update...")
+ master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider,
+ master_configuration)
+ if master_ip and ssh_user and used_private_key:
+ LOG.info("Trying to update %s@%s", master_ip, ssh_user)
+ ssh_handler.execute_ssh(floating_ip=master_ip, private_key=used_private_key, username=ssh_user,
+ commands=[aC.EXECUTE],
+ filepaths=[(aRP.PLAYBOOK_PATH, aRP.PLAYBOOK_PATH_REMOTE),
+ (biRP.BIN_PATH, biRP.BIN_PATH_REMOTE)])
+ return 0
+
+ return 1
diff --git a/bibigrid2/core/actions/version.py b/bibigrid2/core/actions/version.py
new file mode 100644
index 000000000..0ddbdb45d
--- /dev/null
+++ b/bibigrid2/core/actions/version.py
@@ -0,0 +1,6 @@
+"""
+Contains the static variable __version__ which holds the current version number.
+https://www.akeeba.com/how-do-version-numbers-work.html
+"""
+
+__version__ = "0.2.0"
diff --git a/bibigrid2/core/provider.py b/bibigrid2/core/provider.py
new file mode 100644
index 000000000..1c50c8bb6
--- /dev/null
+++ b/bibigrid2/core/provider.py
@@ -0,0 +1,210 @@
+"""
+Holds the abstract class Provider
+"""
+
+
+class Provider: # pylint: disable=too-many-public-methods
+ """
+ See detailed return value information in tests>provider>test_Provider.
+ Make sure to register your newly implemented provider in provider_handler: name:class
+ This will automatically register it for testing when startupTests main is called.
+ """
+ NAME = "Provider"
+
+ class QuotaExceededException(Exception):
+ """
+ Just a renamed Exception.
+ """
+
+ def __init__(self, cloud_specification):
+ """
+ Call necessary methods to create a connection and save cloud_specification data as needed.
+ """
+ self.cloud_specification = cloud_specification # contains sensitive information!
+ self.cloud_specification["identifier"] = self.cloud_specification.get('profile') or self.cloud_specification[
+ 'auth'].get('project_id') or self.cloud_specification["auth"].get('application_credential_id') or "Unknown"
+
+ def create_application_credential(self, name=None):
+ """
+ Creates an application credential with name name
+ :param name: Name of new application credential
+ :return: the application credential dictionary
+ """
+
+ def delete_application_credential_by_id_or_name(self, ac_id_or_name):
+ """
+ Deletes existing application credential by id or name and returns true.
+ If application credential not found it returns false.
+ :param ac_id_or_name: application credential id or name
+ :return: True if deleted else false
+ """
+
+ def get_image_by_id_or_name(self, image_id_or_name):
+ """
+ Returns image that has id or name image_id_or_name
+ :param image_id_or_name: identifier
+ :return: said image (dict) or none if not found
+ """
+
+ def get_flavor(self, instance_type):
+ """
+ Returns flavor that has id or name flavor_id_or_name
+ :param instance_type: identifier
+ :return: said flavor (dict) or none if not found
+ """
+
+ def get_volume_snapshot_by_id_or_name(self, snapshot_id_or_name):
+ """
+ Returns snapshot that has id or name snapshot_id_or_name
+ :param snapshot_id_or_name: identifier
+ :return: said snapshot (dict) or none if not found
+ """
+
+ def get_network_by_id_or_name(self, network_id_or_name):
+ """
+ Returns network that has id or name network_id_or_name
+ :param network_id_or_name: identifier
+ :return: said network (dict) or none if not found
+ """
+
+ def get_subnet_by_id_or_name(self, subnet_id_or_name):
+ """
+ Returns subnet that has id or name subnet_id_or_name
+ :param subnet_id_or_name: identifier
+ :return: said subnet (dict) or none if not found
+ """
+
+ def list_servers(self):
+ """
+ Returns a list of all servers on the logged-in provider
+ :return: said list of servers or empty list if none found
+ """
+
+ def create_server(self, name, flavor, image, network, key_name=None, wait=True, volumes=None): # pylint: disable=too-many-arguments
+ """
+ Creates a new server and waits for it to be accessible if wait=True. If volumes are given, they are attached.
+ Returns said server (dict)
+ :param name: name (str)
+ :param flavor: flavor/type (str)
+ :param image: image/bootable-medium (str)
+ :param network: network (str)
+ :param key_name: (str)
+ :param wait: (bool)
+ :param volumes: List of volumes (list (str))
+ :return: server (dict)
+ """
+
+ def delete_server(self, name_or_id, delete_ips=True):
+ """
+ Deletes server and floating_ip as well if delete_ips is true. The resource is then free again
+ :param name_or_id:
+ :param delete_ips:
+ :return: True if delete succeeded, False otherwise
+ """
+
+ def delete_keypair(self, key_name):
+ """
+ Deletes keypair with key_name
+ :param key_name: (str)
+ :return: True if delete succeeded, False otherwise
+ """
+
+ def get_server_group_by_id_or_name(self, server_group_id_or_name):
+ """
+ Returns server_group that has id or name server_group_id_or_name
+ :param server_group_id_or_name: identifier
+ :return: said server_group (dict) or none if not found
+ """
+
+ def close(self):
+ """
+ Closes connection
+ :return:
+ """
+
+ def create_keypair(self, name, public_key):
+ """
+ Creates a new keypair with name name and public_key public_key
+ :param name: name of new keypair
+ :param public_key: public_key of new keypair
+ :return:
+ """
+
+ def get_network_id_by_subnet(self, subnet):
+ """
+ Gets network_id by subnet
+ :param subnet: id (str)
+ :return: (str)
+ """
+
+ def get_subnet_ids_by_network(self, network):
+ """
+ Gets subnet_ids (list (str)) by network_id
+ :param network: id (str)
+ :return: subnet_ids (list (str))
+ """
+
+ def get_free_resources(self):
+ """
+ Gets free resources. If a resource cannot be determined, assume maximum is free.
+ :return: Dictionary containing the free resources
+ """
+
+ def get_volume_by_id_or_name(self, name_or_id):
+ """
+ Returns volume that has id or name name_or_id
+ :param name_or_id: identifier
+ :return: said volume (dict) or none if not found
+ """
+
+ def create_volume_from_snapshot(self, snapshot_name_or_id):
+ """
+ Creates a volume from snapshot.
+ :param snapshot_name_or_id: name or id of snapshot
+ :return: id of created volume or none if failed
+ """
+
+ def get_external_network(self, network_name_or_id):
+ """
+ Finds the router interface whose network id equals the given network and thereby determines the external network.
+ :param network_name_or_id: Name or id of network
+ :return: Corresponding external network
+ """
+
+ def add_auto_ip(self, server, wait=False, timeout=60, reuse=True):
+ """
+ Add a floating IP to a server.
+ Reuses an existing down floating-ip if one is available; otherwise a new one is created.
+ :param server: the server that said floating ip will be attached to
+ :param wait: wait for floating-ip to be assigned
+ :param timeout: when to accept failing
+ :param reuse: if False will just create a new floating-ip and not reuse an existing down one
+ :return: the floating-ip
+ """
+
+ def attach_available_floating_ip(self, network=None, server=None):
+ """
+ Get a floating IP from a network or a pool and attach it to the server
+ :param network:
+ :param server:
+ :return:
+ """
+
+ def get_images(self):
+ """
+ Get a generator able to generate all images
+ @return: A generator able to generate all images
+ """
+
+ def get_flavors(self):
+ """
+ Get a generator able to generate all flavors
+ @return: A generator able to generate all flavors
+ """
+
+ def get_active_images(self):
+ """Returns the names of all images whose status is active."""
+ return [image["name"] for image in self.get_images() if image["status"].lower() == "active"]
+
+ def get_active_flavors(self):
+ """Returns the names of all flavors that are neither legacy nor deprecated."""
+ return [flavor["name"] for flavor in self.get_flavors()
+ if "legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()]
diff --git a/bibigrid2/core/startup.py b/bibigrid2/core/startup.py
new file mode 100755
index 000000000..7973d2ca5
--- /dev/null
+++ b/bibigrid2/core/startup.py
@@ -0,0 +1,139 @@
+"""
+Contains main method. Interprets command line, sets logging and starts corresponding action.
+"""
+import logging
+import math
+import os
+import sys
+import time
+import traceback
+
+import yaml
+
+from bibigrid2.core.actions import check, create, ide, list_clusters, terminate_cluster, update, version
+from bibigrid2.core.utility import command_line_interpreter
+from bibigrid2.core.utility.handler import configuration_handler, provider_handler
+
+LOGGING_HANDLER_LIST = [logging.StreamHandler(), logging.FileHandler("bibigrid2.log")] # stdout and to file
+VERBOSITY_LIST = [logging.WARNING, logging.INFO, logging.DEBUG]
+LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
+
+LOG = logging.getLogger("bibigrid")
+
+
+def get_cluster_id_from_mem():
+ """
+ Reads the cluster_id of the last created cluster and returns it. Used if no cluster_id is given.
+
+ @return: cluster_id. If no mem file can be found, the file is not a valid yaml file or doesn't contain a cluster_id,
+ it returns None.
+ """
+ if os.path.isfile(create.CLUSTER_MEMORY_PATH):
+ try:
+ with open(create.CLUSTER_MEMORY_PATH, mode="r", encoding="UTF-8") as cluster_memory_file:
+ mem_dict = yaml.safe_load(stream=cluster_memory_file)
+ return (mem_dict or {}).get("cluster_id")  # mem_dict is None if the file is empty
+ except yaml.YAMLError as exc:
+ LOG.warning("Couldn't read configuration %s: %s", create.CLUSTER_MEMORY_PATH, exc)
+ return None
+
+
+def set_logger(verbosity):
+ """
+ Sets verbosity, format and handler.
+ :param verbosity: level of verbosity
+ :return:
+ """
+
+ capped_verbosity = min(verbosity, len(VERBOSITY_LIST) - 1)
+ # LOG.basicConfig(format=LOGGER_FORMAT, level=VERBOSITY_LIST[capped_verbosity],
+ # handlers=LOGGING_HANDLER_LIST)
+ logging.basicConfig(format=LOGGER_FORMAT, handlers=LOGGING_HANDLER_LIST)
+
+ log = logging.getLogger("bibigrid")
+ log.setLevel(VERBOSITY_LIST[capped_verbosity])
+
+ log.debug(f"Logging verbosity set to {capped_verbosity}")
+
+
+def run_action(args, configurations, config_path): # pylint: disable=too-many-nested-blocks,too-many-branches
+ """
+ Uses args to decide which action will be executed and executes said action.
+ :param args: command line arguments
+ :param configurations: list of configurations (dicts)
+ :param config_path: path to configurations-file
+ :return:
+ """
+ if args.version:
+ LOG.info("Action version selected")
+ print(version.__version__)
+ return 0
+
+ start_time = time.time()
+ exit_state = 0
+ try:
+ providers = provider_handler.get_providers(configurations)
+ if providers:
+ if args.list_clusters:
+ LOG.info("Action list_clusters selected")
+ exit_state = list_clusters.print_list_clusters(args.cluster_id, providers)
+ elif args.check:
+ LOG.info("Action check selected")
+ exit_state = check.check(configurations, providers)
+ elif args.create:
+ LOG.info("Action create selected")
+ creator = create.Create(providers=providers,
+ configurations=configurations,
+ debug=args.debug,
+ config_path=config_path)
+ print("Creating a new cluster takes about 10 or more minutes depending on your cloud provider "
+ "and your configuration. Be patient.")
+ exit_state = creator.create()
+ else:
+ if not args.cluster_id:
+ args.cluster_id = get_cluster_id_from_mem()
+ LOG.info("No cid (cluster_id) specified. Defaulting to last created cluster: %s",
+ args.cluster_id or 'None found')
+ if args.cluster_id:
+ if args.terminate_cluster:
+ LOG.info("Action terminate_cluster selected")
+ exit_state = terminate_cluster.terminate_cluster(args.cluster_id, providers, args.debug)
+ elif args.ide:
+ LOG.info("Action ide selected")
+ exit_state = ide.ide(args.cluster_id, providers[0], configurations[0])
+ elif args.update:
+ LOG.info("Action update selected")
+ exit_state = update.update(args.cluster_id, providers[0], configurations[0])
+ else:
+ LOG.warning("Please make use of -cid .")
+ for provider in providers:
+ provider.close()
+ else:
+ exit_state = 1
+ except Exception as err: # pylint: disable=broad-except
+ if args.debug:
+ traceback.print_exc()
+ else:
+ LOG.error(err)
+ exit_state = 2
+ time_in_s = time.time() - start_time
+ print(f"--- {math.floor(time_in_s / 60)} minutes and {time_in_s % 60} seconds ---")
+ return exit_state
+
+
+def main():
+ """
+ Interprets command line, sets logger, reads configuration and runs selected action. Then exits.
+ :return:
+ """
+
+ args = command_line_interpreter.interpret_command_line()
+ set_logger(args.verbose)
+ configurations = configuration_handler.read_configuration(args.config_input)
+ if configurations:
+ sys.exit(run_action(args, configurations, args.config_input))
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bibigrid2/core/utility/ansible_commands.py b/bibigrid2/core/utility/ansible_commands.py
new file mode 100644
index 000000000..c84030d87
--- /dev/null
+++ b/bibigrid2/core/utility/ansible_commands.py
@@ -0,0 +1,58 @@
+"""
+Module containing a bunch of useful commands to be used by sshHandler.py for cluster setup
+"""
+
+import os
+import bibigrid2.core.utility.paths.ansible_resources_path as aRP
+
+#TO_LOG = "| sudo tee -a /var/log/ansible.log"
+#AIY = "apt-get -y install"
+#SAU = "sudo apt-get update"
+#NO_KEY_CHECK = "export ANSIBLE_HOST_KEY_CHECKING=False"
+NO_UPDATE = ("""sudo sed -i 's/APT::Periodic::Unattended-Upgrade "1";/APT::Periodic::Unattended-Upgrade "0";/g' """
+ """/etc/apt/apt.conf.d/20auto-upgrades""", "Disable apt auto update.")
+# Setup (Python for everyone)
+# UPDATE = f"sudo {AU} {TO_LOG}"
+# PIP = f"sudo pip3 install --upgrade pip {TO_LOG}"
+# SETUPTOOLS = "sudo pip3 install setuptools"
+# LOG = "export ANSIBLE_LOG_PATH=~/ansible.log"
+WAIT_READY = ('while sudo lsof /var/lib/dpkg/lock 2> /dev/null; do echo "/var/lib/dpkg/lock locked - wait for 10 seconds"; '
+ 'sleep 10; done', "Wait for dpkg lock removed.")
+# SLEEP_10 = "sleep 10s"
+# RANDOM = "sudo DEBIAN_FRONTEND=noninteractive apt-get --yes install apt-transport-https ca-certificates " \
+# "software-properties-common python3 python3-pip libffi-dev libssl-dev"
+# PYTHON_WORKERS = f'ansible workers -i "{aRP.HOSTS_CONFIG_FILE_REMOTE}" --become -m raw -a "{SAU} && {AIY} python3' \
+# f'"'
+
+# Test Ansible
+# PING = (f'ansible -i "{aRP.HOSTS_CONFIG_FILE_REMOTE}" all -m ping',"Ping all hosts using ansible.")
+# OK = ('if [ $? -eq 0 ]; then echo "Ansible configuration seems to work properly."; '
+# 'else echo"Ansible hosts not reachable. There seems to be a misconfiguration."; fi',"Check for ")
+
+# Run ansible-galaxy to install ansible-galaxy roles from galaxy, git or url (.tar.gz)
+# GALAXY = f"ansible-galaxy install --roles-path {aRP.ADDITIONAL_ROLES_ROOT_PATH_REMOTE} -r {aRP.REQUIREMENTS_YML}"
+
+# Extract ansible roles from files (.tar.gz, .tgz)
+# EXTRACT = f"for f in $(find /tmp/roles -type f -regex '.*\\.t\\(ar\\.\\)?gz'); " \
+# f"do tar -xzf $f -C {aRP.ADDITIONAL_ROLES_ROOT_PATH_REMOTE}; done"
+
+# Fix line endings for all text based ansible file to ensure windows files being used correctly
+# GET_ASCII_FILES = "files=$(for f in $( find ~/playbook -type f); do file ${f} | grep ASCII | cut -f 1 -d ':'; done;)"
+# REPLACE_ENDINGS = "for file in ${file}; do sed -i 's/\\r$//' \"${file}\"; done"
+
+# Utility
+ADD_PLAYBOOK_TO_LINUX_HOME = ("ln -s /opt/playbook ~/playbook", "Link /opt/playbook to ~/playbook.")
+
+# Execute
+PLAYBOOK_HOME = ("sudo mkdir -p /opt/playbook", "Create playbook home.")
+PLAYBOOK_HOME_RIGHTS = ("sudo chown ubuntu:ubuntu /opt/playbook", "Adjust playbook home permission.")
+MV_ANSIBLE_CONFIG = (
+ "sudo install -D /opt/playbook/ansible.cfg /etc/ansible/ansible.cfg", "Move ansible configuration.")
+EXECUTE = (f"ansible-playbook {os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.SITE_YML)} -i "
+ f"{os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.ANSIBLE_HOSTS)} -l master",
+ "Execute ansible playbook. Be patient.")
+
+# ansible setup
+UPDATE = ("sudo apt-get update", "Update apt repository lists.")
+PYTHON3_PIP = "sudo apt-get install -y python3-pip", "Install python3 pip using apt."
+ANSIBLE_PASSLIB = ("sudo pip install ansible==6.6 passlib", "Install Ansible and Passlib using pip.")
diff --git a/bibigrid2/core/utility/ansible_configurator.py b/bibigrid2/core/utility/ansible_configurator.py
new file mode 100644
index 000000000..0bce57c1e
--- /dev/null
+++ b/bibigrid2/core/utility/ansible_configurator.py
@@ -0,0 +1,305 @@
+"""
+Prepares ansible files (vars, common_configuration, ...)
+"""
+
+import logging
+
+import mergedeep
+import yaml
+
+from bibigrid2.core.actions import create
+from bibigrid2.core.actions import ide
+from bibigrid2.core.actions import list_clusters
+from bibigrid2.core.utility.handler import configuration_handler
+from bibigrid2.core.utility import id_generation
+from bibigrid2.core.utility.paths import ansible_resources_path as aRP
+from bibigrid2.core.utility import yaml_dumper
+
+DEFAULT_NFS_SHARES = ["/vol/spool"]
+ADDITIONAL_PATH = "additional/"
+PYTHON_INTERPRETER = "/usr/bin/python3"
+MASTER_ROLES = [{"role": "bibigrid", "tags": ["bibigrid", "bibigrid-master"]}]
+WORKER_ROLES = [{"role": "bibigrid", "tags": ["bibigrid", "bibigrid-worker"]}]
+VARS_FILES = [aRP.INSTANCES_YML, aRP.CONFIG_YML]
+IDE_CONF = {"ide": False, "workspace": ide.DEFAULT_IDE_WORKSPACE, "port_start": ide.REMOTE_BIND_ADDRESS,
+ "port_end": ide.DEFAULT_IDE_PORT_END, "build": False}
+ZABBIX_CONF = {"db": "zabbix", "db_user": "zabbix", "db_password": "zabbix", "timezone": "Europe/Berlin",
+ "server_name": "bibigrid", "admin_password": "bibigrid"}
+SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme",
+ "munge_key": id_generation.generate_munge_key(),
+ "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}}
+LOG = logging.getLogger("bibigrid")
+
+def generate_site_file_yaml(custom_roles):
+ """
+ Generates site_yaml (dict).
+ Copies of the module level defaults are used so repeated calls don't accumulate custom roles.
+ :param custom_roles: ansibleRoles given by the config
+ :return: site_yaml (dict)
+ """
+ vars_files = list(VARS_FILES)
+ master_roles = list(MASTER_ROLES)
+ worker_roles = list(WORKER_ROLES)
+ # add custom roles and vars
+ for custom_role in custom_roles:
+ vars_files.append(custom_role["vars_file"])
+ master_roles.append(ADDITIONAL_PATH + custom_role["name"])
+ worker_roles.append(ADDITIONAL_PATH + custom_role["name"])
+ site_yaml = [{'hosts': 'master', "become": "yes",
+ "vars_files": vars_files, "roles": master_roles},
+ {"hosts": "workers", "become": "yes", "vars_files": vars_files,
+ "roles": worker_roles}] # ,
+ # {"hosts": "vpnwkr", "become": "yes", "vars_files": copy.deepcopy(vars_files),
+ # "roles": ["common", "vpnwkr"]}]
+ return site_yaml
+
+
+def generate_instances_yaml(cluster_dict, configuration, provider, cluster_id): # pylint: disable=too-many-locals
+ """
+ Filters out unnecessary server information.
+ ToDo: which information is really necessary will be determined by further development.
+ :param cluster_dict: cluster_dict to get the information from
+ :param configuration: configuration of master cloud ToDo needs to be list in the future
+ :param provider: provider of master cloud ToDo needs to be list in the future
+ :param cluster_id: To get proper naming
+ :return: filtered information (dict)
+ """
+ LOG.info("Generating instances file...")
+ workers = []
+ flavor_keys = ["name", "ram", "vcpus", "disk", "ephemeral"]
+ for index, worker in enumerate(configuration.get("workerInstances", [])):
+ flavor = provider.get_flavor(worker["type"])
+ flavor_dict = {key: flavor[key] for key in flavor_keys}
+ image = worker["image"]
+ network = configuration["network"]
+ worker_range = "[0-{}]"
+ name = create.WORKER_IDENTIFIER(worker_group=index, cluster_id=cluster_id,
+ additional=worker_range.format(worker.get('count', 1) - 1))
+ regexp = create.WORKER_IDENTIFIER(worker_group=index, cluster_id=cluster_id,
+ additional=r"\d+")
+ workers.append({"name": name, "regexp": regexp, "image": image, "network": network, "flavor": flavor_dict})
+ master = {key: cluster_dict["master"][key] for key in
+ ["name", "private_v4", "public_v4", "public_v6", "cloud_specification"]}
+ master["flavor"] = {key: cluster_dict["master"]["flavor"][key] for key in flavor_keys}
+ return {"master": master, "workers": workers}
+
+
+def pass_through(dict_from, dict_to, key_from, key_to=None):
+ """
+ If key_from is defined in dict_from, sets key_to of dict_to to the corresponding value of dict_from. Happens in place.
+ @param key_from:
+ @param key_to:
+ @param dict_from:
+ @param dict_to:
+ @return:
+ """
+ if not key_to:
+ key_to = key_from
+ if dict_from.get(key_from):
+ dict_to[key_to] = dict_from[key_from]
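+ # Usage sketch: pass_through({"waitForServices": ["nfs"]}, common, "waitForServices", "wait_for_services")
+ # sets common["wait_for_services"] to ["nfs"]; falsy or missing values are not passed through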
+
+
+def generate_common_configuration_yaml(cidrs, configuration, cluster_id, ssh_user, default_user):
+ """
+ Generates common_configuration yaml (dict)
+ :param cidrs: str subnet cidrs (provider generated)
+ :param configuration: master configuration (first in file)
+ :param cluster_id: Id of cluster
+ :param ssh_user: user for ssh connections
+ :param default_user: Given default user
+ :return: common_configuration_yaml (dict)
+ """
+ LOG.info("Generating common configuration file...")
+ # print(configuration.get("slurmConf", {}))
+ common_configuration_yaml = {"cluster_id": cluster_id, "cluster_cidrs": cidrs,
+ "default_user": default_user,
+ "local_fs": configuration.get("localFS", False),
+ "local_dns_lookup": configuration.get("localDNSlookup", False),
+ "use_master_as_compute": configuration.get("useMasterAsCompute", True),
+ "enable_slurm": configuration.get("slurm", False),
+ "enable_zabbix": configuration.get("zabbix", False),
+ "enable_nfs": configuration.get("nfs", False),
+ "enable_ide": configuration.get("ide", False),
+ "slurm": configuration.get("slurm", True), "ssh_user": ssh_user,
+ "slurm_conf": mergedeep.merge({}, SLURM_CONF, configuration.get("slurmConf", {}),
+ strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
+ }
+ if configuration.get("nfs"):
+ nfs_shares = configuration.get("nfsShares", [])
+ nfs_shares = nfs_shares + DEFAULT_NFS_SHARES
+ common_configuration_yaml["nfs_mounts"] = [{"src": "/" + nfs_share, "dst": "/" + nfs_share}
+ for nfs_share in nfs_shares]
+ common_configuration_yaml["ext_nfs_mounts"] = [{"src": ext_nfs_share, "dst": ext_nfs_share} for
+ ext_nfs_share in (configuration.get("extNfsShares", []))]
+
+ if configuration.get("ide"):
+ common_configuration_yaml["ide_conf"] = mergedeep.merge({}, IDE_CONF, configuration.get("ideConf", {}),
+ strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
+ if configuration.get("zabbix"):
+ common_configuration_yaml["zabbix_conf"] = mergedeep.merge({}, ZABBIX_CONF, configuration.get("zabbixConf", {}),
+ strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
+
+ for from_key, to_key in [("waitForServices", "wait_for_services"), ("ansibleRoles", "ansible_roles"),
+ ("ansibleGalaxyRoles", "ansible_galaxy_roles")]:
+ pass_through(configuration, common_configuration_yaml, from_key, to_key)
+ return common_configuration_yaml
+
+
+def generate_ansible_hosts_yaml(ssh_user, configuration, cluster_id):
+ """
+ Generates ansible_hosts_yaml (inventory file).
+ :param ssh_user: str global SSH-username
+ :param configuration: dict
+ :param cluster_id: id of cluster
+ :return: ansible_hosts yaml (dict)
+ """
+ LOG.info("Generating ansible hosts file...")
+ ansible_hosts_yaml = {"master": {"hosts": {"localhost": to_instance_host_dict(ssh_user)}},
+ "workers": {"hosts": {}, "children": {"ephemeral": {"hosts": {}}}}
+ }
+ # vpnwkr are handled like workers on this level
+ workers = ansible_hosts_yaml["workers"]
+ for index, worker in enumerate(configuration.get("workerInstances", [])):
+ name = create.WORKER_IDENTIFIER(worker_group=index, cluster_id=cluster_id,
+ additional=f"[0:{worker.get('count', 1) - 1}]")
+ worker_dict = to_instance_host_dict(ssh_user, ip="", local=False)
+ if "ephemeral" in worker["type"]:
+ workers["children"]["ephemeral"]["hosts"][name] = worker_dict
+ else:
+ workers["hosts"][name] = worker_dict
+ return ansible_hosts_yaml
+
+
+def to_instance_host_dict(ssh_user, ip="localhost", local=True): # pylint: disable=invalid-name
+ """
+ Generates host entry
+ :param ssh_user: str global SSH-username
+ :param ip: str ip
+ :param local: bool
+ :return: host entry (dict)
+ """
+ host_yaml = {"ansible_connection": "local" if local else "ssh",
+ "ansible_python_interpreter": PYTHON_INTERPRETER,
+ "ansible_user": ssh_user}
+ if ip:
+ host_yaml["ip"] = ip
+ return host_yaml
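+# For the master (defaults), the entry is (sketch):
+# {"ansible_connection": "local", "ansible_python_interpreter": PYTHON_INTERPRETER,
+# "ansible_user": ssh_user, "ip": "localhost"}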
+
+
+def get_cidrs(configurations, providers):
+ """
+ Gets cidrs of all subnets in all providers
+ :param configurations: list of configurations (dict)
+ :param providers: list of providers
+ :return:
+ """
+ all_cidrs = []
+ for provider, configuration in zip(providers, configurations):
+ provider_cidrs = {"provider": type(provider).__name__, "provider_cidrs": []}
+ if isinstance(configuration["subnet"], list):
+ for subnet_id_or_name in configuration["subnet"]:
+ subnet = provider.get_subnet_by_id_or_name(subnet_id_or_name)
+ provider_cidrs["provider_cidrs"].append(subnet["cidr"]) # check key again
+ else:
+ subnet = provider.get_subnet_by_id_or_name(configuration["subnet"])
+ provider_cidrs["provider_cidrs"].append(subnet["cidr"])
+ all_cidrs.append(provider_cidrs)
+ return all_cidrs
+
+
+def get_ansible_roles(ansible_roles):
+ """
+ Checks if each ansible_role has all necessary values and adds it to the returned list if so.
+ :param ansible_roles: ansible_roles from master configuration (first configuration)
+ :return: list of valid ansible_roles
+ """
+ ansible_roles_yaml = []
+ for ansible_role in (ansible_roles or []):
+ if ansible_role.get("file") and ansible_role.get("hosts"):
+ ansible_role_dict = {"file": ansible_role["file"], "hosts": ansible_role["hosts"]}
+ for key in ["name", "vars", "vars_file"]:
+ if ansible_role.get(key):
+ ansible_role_dict[key] = ansible_role[key]
+ ansible_roles_yaml.append(ansible_role_dict)
+ else:
+ LOG.warning("Ansible role %s had neither galaxy,git nor url. Not added.", ansible_role)
+ return ansible_roles_yaml
+
+
+def get_ansible_galaxy_roles(ansible_galaxy_roles):
+ """
+ Checks if each ansible_galaxy_role has all necessary values and adds it to the returned list if so.
+ :param ansible_galaxy_roles:
+ :return: list of valid ansible_galaxy_roles
+ """
+ ansible_galaxy_roles_yaml = []
+ for ansible_galaxy_role in (ansible_galaxy_roles or []):
+ if ansible_galaxy_role.get("galaxy") or ansible_galaxy_role.get("git") or ansible_galaxy_role.get("url"):
+ ansible_galaxy_role_dict = {"hosts": ansible_galaxy_role["hosts"]}
+ for key in ["name", "galaxy", "git", "url", "vars", "vars_file"]:
+ if ansible_galaxy_role.get(key):
+ ansible_galaxy_role_dict[key] = ansible_galaxy_role[key]
+ ansible_galaxy_roles_yaml.append(ansible_galaxy_role_dict)
+ else:
+ LOG.warning("Galaxy role %s had neither galaxy,git nor url. Not added.", ansible_galaxy_role)
+ return ansible_galaxy_roles_yaml
+
+
+def generate_worker_specification_file_yaml(configurations):
+ """
+ Generates worker_specification_file_yaml
+ :param configurations: list of configurations (dict)
+ :return: worker_specification_yaml
+ """
+ LOG.info("Generating worker specification file...")
+ worker_groups_list = configuration_handler.get_list_by_key(configurations, "workerInstances", False)
+ # create.prepare_configuration guarantees that key is set
+ network_list = configuration_handler.get_list_by_key(configurations, "network", False)
+ worker_specification_yaml = []
+ for worker_groups_provider_list, network in zip(worker_groups_list, network_list):
+ for worker_group in worker_groups_provider_list:
+ worker_specification_yaml.append({"TYPE": worker_group["type"],
+ "IMAGE": worker_group["image"],
+ "NETWORK": network})
+ return worker_specification_yaml
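+# Each entry looks like (sketch, hypothetical values):
+# {"TYPE": "de.NBI tiny", "IMAGE": "Ubuntu 22.04", "NETWORK": "my-network"}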
+
+
+def write_yaml(path, generated_yaml, alias=False):
+ """
+ Writes generated_yaml to file path with or without alias
+ @param path:
+ @param generated_yaml:
+ @param alias:
+ @return:
+ """
+ LOG.debug("Writing yaml %s", path)
+ with open(path, mode="w+", encoding="UTF-8") as file:
+ if alias:
+ yaml.safe_dump(data=generated_yaml, stream=file)
+ else:
+ yaml.dump(data=generated_yaml, stream=file, Dumper=yaml_dumper.NoAliasSafeDumper)
+
+
+def configure_ansible_yaml(providers, configurations, cluster_id):
+ """
+ Generates and writes all ansible-configuration-yaml files.
+ :param providers: list of providers
+ :param configurations: list of configurations (dict)
+ :param cluster_id: id of cluster to create
+ :return:
+ """
+ LOG.info("Writing ansible files...")
+ alias = configurations[0].get("aliasDumper", False)
+ cluster_dict = list_clusters.dict_clusters(providers)[cluster_id]
+ ansible_roles = get_ansible_roles(configurations[0].get("ansibleRoles"))
+ default_user = providers[0].cloud_specification["auth"].get("username", configurations[0].get("sshUser", "Ubuntu"))
+ for path, generated_yaml in [
+ (aRP.WORKER_SPECIFICATION_FILE, generate_worker_specification_file_yaml(configurations)),
+ (aRP.COMMONS_CONFIG_FILE, generate_common_configuration_yaml(cidrs=get_cidrs(configurations, providers),
+ configuration=configurations[0],
+ cluster_id=cluster_id,
+ ssh_user=configurations[0]["sshUser"],
+ default_user=default_user)),
+ (aRP.COMMONS_INSTANCES_FILE, generate_instances_yaml(cluster_dict, configurations[0],
+ providers[0], cluster_id)),
+ (aRP.HOSTS_CONFIG_FILE, generate_ansible_hosts_yaml(configurations[0]["sshUser"], configurations[0],
+ cluster_id)),
+ (aRP.SITE_CONFIG_FILE, generate_site_file_yaml(ansible_roles))]:
+ write_yaml(path, generated_yaml, alias)
diff --git a/bibigrid2/core/utility/command_line_interpreter.py b/bibigrid2/core/utility/command_line_interpreter.py
new file mode 100644
index 000000000..b057bb82b
--- /dev/null
+++ b/bibigrid2/core/utility/command_line_interpreter.py
@@ -0,0 +1,44 @@
+"""
+Has necessary methods and variables to interpret the command line
+"""
+
+import argparse
+import os
+
+STANDARD_CONFIG_INPUT_PATH = os.path.expanduser("~/.config/bibigrid")
+FOLDER_START = ("~/", "/")
+
+
+def interpret_command_line():
+ """
+ Interprets commandline. Used in startup.py
+ :return:
+ """
+ parser = argparse.ArgumentParser(description='Bibigrid2 sets up clusters easily inside a cloud environment')
+ parser.add_argument("-v", "--verbose", action="count", default=0,
+ help="Increases logging verbosity. `-v` adds more info to the logfile, "
+ "`-vv` adds debug information to the logfile.")
+ parser.add_argument("-d", "--debug", action='store_true', help="Keeps cluster active. Asks before shutdown. "
+ "Offers termination after create")
+ parser.add_argument("-i", "--config_input", metavar="", help="Path to YAML configurations file. "
+ "Relative paths can be used and start "
+ "at ~/.config/bibigrid", required=True,
+ type=lambda s: s if s.startswith(FOLDER_START) else os.path.join(STANDARD_CONFIG_INPUT_PATH, s))
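+ # e.g. "-i my.yml" resolves to ~/.config/bibigrid/my.yml while "-i /tmp/my.yml" and
+ # "-i ~/my.yml" are used as-is (file names hypothetical)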
+ parser.add_argument("-cid", "--cluster_id", metavar="", type=str, default="",
+ help="Cluster id is needed for ide and termination")
+
+ actions = parser.add_mutually_exclusive_group(required=True)
+ actions.add_argument("-V", "--version", action='store_true', help="Displays version")
+ actions.add_argument("-t", "--terminate_cluster", action='store_true',
+ help="Terminates cluster. Needs cluster-id set.")
+ actions.add_argument("-c", "--create", action='store_true', help="Creates cluster")
+ actions.add_argument("-l", "--list_clusters", action='store_true',
+ help="Lists all running clusters. If cluster-id is set, will list this cluster in detail only")
+ actions.add_argument("-ch", "--check", action='store_true', help="Validates cluster configuration")
+ actions.add_argument("-ide", "--ide", action='store_true',
+ help="Establishes a secured connection to ide. Needs cluster-id set")
+ actions.add_argument("-u", "--update", action='store_true', help="Updates master's playbook. "
+ "Needs cluster-id set, no job running "
+ "and no workers up")
+ args = parser.parse_args()
+ return args
diff --git a/bibigrid2/core/utility/handler/cluster_ssh_handler.py b/bibigrid2/core/utility/handler/cluster_ssh_handler.py
new file mode 100644
index 000000000..78500ade0
--- /dev/null
+++ b/bibigrid2/core/utility/handler/cluster_ssh_handler.py
@@ -0,0 +1,40 @@
+"""
+This module gets information about ssh connection.
+"""
+
+import logging
+import os
+
+from bibigrid2.core.actions import create, list_clusters
+
+LOG = logging.getLogger("bibigrid")
+def get_ssh_connection_info(cluster_id, master_provider, master_configuration):
+ """
+ Gets master_ip, ssh_user and private key to enable other modules to create an ssh connection to a clusters master
+ @param cluster_id: id of cluster to connect to
+ @param master_provider: master's provider
+ @param master_configuration: master's configuration
+ @return: triple (master_ip, ssh_user, private_key)
+ """
+ # If cluster_id is an ip, cluster_id will be used for master_ip
+ if "." in cluster_id:
+ LOG.info("Interpreting %s as ip since it doesn't match cluster_id", cluster_id)
+ master_ip = cluster_id
+ else:
+ master_ip = list_clusters.get_master_access_ip(cluster_id, master_provider)
+ ssh_user = master_configuration.get("sshUser")
+ public_keys = master_configuration.get("sshPublicKeyFiles")
+ used_private_key = None
+
+ # first check configuration then if not found take the temporary key
+ if public_keys:
+ public_key = public_keys[0]
+ if isinstance(public_key, str):
+ private_key = public_key.strip(".pub")
+ if os.path.isfile(private_key):
+ used_private_key = private_key
+ if not used_private_key:
+ private_key = os.path.join(create.KEY_FOLDER, create.KEY_NAME.format(cluster_id=cluster_id))
+ if os.path.isfile(private_key):
+ used_private_key = private_key
+ return master_ip, ssh_user, used_private_key
diff --git a/bibigrid2/core/utility/handler/configuration_handler.py b/bibigrid2/core/utility/handler/configuration_handler.py
new file mode 100644
index 000000000..51e555e3c
--- /dev/null
+++ b/bibigrid2/core/utility/handler/configuration_handler.py
@@ -0,0 +1,142 @@
+"""
+This module contains methods to read the configuration and cloud specification.
+"""
+
+import logging
+import os
+
+import mergedeep
+import yaml
+
+CLOUDS_YAML_PATHS = ["~/.config/bibigrid", "/etc/bibigrid", ""]
+CLOUDS_YAML = "clouds.yaml"
+CLOUDS_PUBLIC_YAML = "clouds-public.yaml"
+CLOUD_ROOT_KEY = "clouds"
+CLOUD_PUBLIC_ROOT_KEY = "public-clouds"
+CLOUDS_PUBLIC_NAME_KEY = "profile"
+CLOUD_CONFIGURATION_KEY = "cloud"
+
+LOG = logging.getLogger("bibigrid")
+
+def read_configuration(path="bibigrid.yml"):
+ """
+ Reads yaml from file and returns the list of all configurations
+ :param path: Path to yaml file
+ :return: configurations (dict)
+ """
+ configuration = None
+ if os.path.isfile(path):
+ with open(path, mode="r", encoding="UTF-8") as stream:
+ try:
+ configuration = yaml.safe_load(stream)
+ except yaml.YAMLError as exc:
+ LOG.warning("Couldn't read configuration %s: %s", path, exc)
+ else:
+ LOG.warning("No such configuration file %s.", path)
+ return configuration
+
+
+def get_list_by_key(configurations, key, get_empty=True):
+ """
+ Returns a list of objects which are value to the key.
+ :param get_empty: if True, missing or empty values are included as None; if False, they are skipped
+ :param configurations: YAML of configuration File containing the configuration-data for each provider
+ :param key: Key that is looked out for
+ :return: List of values of said key through all configs
+ """
+ return [configuration.get(key) for configuration in configurations if configuration.get(key) or get_empty]
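+# Sketch: get_list_by_key([{"cloud": "a"}, {}], "cloud") -> ["a", None]
+# get_list_by_key([{"cloud": "a"}, {}], "cloud", get_empty=False) -> ["a"]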
+
+
+# def get_dict_list_by_key_list(configurations, keys, get_empty=True):
+# return [{key: configuration.get(key) for key in keys if configuration.get(key) or get_empty}
+# for configuration in configurations]
+
+
+def find_file_in_folders(file_name, folders):
+ """
+ Searches all folders for a file with name file_name, loads (expects yaml) the first match and returns the dict
+ @param file_name: name of the file to look for
+ @param folders: folders to search for file named file_name
+ @return: dict of match content or None if not found
+ """
+ for folder_path in folders:
+ file_path = os.path.expanduser(os.path.join(folder_path, file_name))
+ if os.path.isfile(file_path):
+ LOG.debug("File %s found in folder %s.", file_name, folder_path)
+ return read_configuration(file_path)
+ LOG.debug("File %s in folder %s not found.", file_name, folder_path)
+ return None
+
+
+def get_clouds_files():
+ """
+ Wrapper to call find_file_in_folders with the right arguments to find the clouds.yaml and clouds-public.yaml
+ @return: tuple of dicts containing the clouds.yaml and clouds-public.yaml data or None if not found.
+ """
+ clouds_yaml = find_file_in_folders(CLOUDS_YAML, CLOUDS_YAML_PATHS)
+ clouds_public_yaml = find_file_in_folders(CLOUDS_PUBLIC_YAML, CLOUDS_YAML_PATHS)
+ clouds = None
+ clouds_public = None
+ if clouds_yaml:
+ clouds = clouds_yaml.get(CLOUD_ROOT_KEY)
+ if not clouds:
+ LOG.warning("%s is not valid. Must contain key '%s:'", CLOUDS_YAML, CLOUD_ROOT_KEY)
+ else:
+ LOG.warning("No %s at %s! Please copy your %s to one of those listed folders. Aborting...",
+ CLOUDS_YAML, CLOUDS_YAML_PATHS, CLOUDS_YAML)
+ if clouds_public_yaml:
+ clouds_public = clouds_public_yaml.get(CLOUD_PUBLIC_ROOT_KEY)
+ if not clouds_public:
+ LOG.warning("%s is not valid. Must contain key '%s'", CLOUDS_PUBLIC_YAML, CLOUD_PUBLIC_ROOT_KEY)
+ return clouds, clouds_public
+
+
+def get_cloud_specification(cloud_name, clouds, clouds_public):
+ """
+ As in openstack, cloud_public_specification will be overwritten by cloud_private_specification.
+ :param cloud_name: name of the cloud to look for in clouds.yaml
+ :param clouds: dict containing the data loaded from clouds.yaml
+ :param clouds_public: dict containing the data loaded from clouds-public.yaml
+ :return:
+ """
+ cloud_full_specification = {}
+ cloud_private_specification = clouds.get(cloud_name)
+ if cloud_private_specification:
+ cloud_full_specification = cloud_private_specification
+ public_cloud_name = cloud_private_specification.get(CLOUDS_PUBLIC_NAME_KEY)
+ if public_cloud_name and clouds_public:
+ LOG.debug("Trying to find profile...")
+ cloud_public_specification = clouds_public.get(public_cloud_name)
+ if not cloud_public_specification:
+ LOG.warning("%s is not a valid profile name. "
+ "Must be contained under key '%s'", public_cloud_name, CLOUD_PUBLIC_ROOT_KEY)
+ else:
+ LOG.debug("Profile found. Merging begins...")
+ try:
+ mergedeep.merge(cloud_full_specification, cloud_public_specification,
+ strategy=mergedeep.Strategy.TYPESAFE_REPLACE)
+ except TypeError as exc:
+ LOG.warning("Existing %s and %s configuration keys don't match in type: %s",
+ CLOUDS_YAML, CLOUDS_PUBLIC_YAML, exc)
+ return {}
+ else:
+ LOG.debug("Using only clouds.yaml since no clouds-public profile is set.")
+ else:
+ LOG.warning("%s is not a valid cloud name. Must be contained under key '%s'", cloud_name, CLOUD_ROOT_KEY)
+ return cloud_full_specification
+
+
+def get_cloud_specifications(configurations):
+ """
+ Calls get_cloud_specification to get the cloud_specification for every configuration
+ @param configurations:
+ @return: list of dicts: cloud_specifications of every configuration
+ """
+ clouds, clouds_public = get_clouds_files()
+ cloud_specifications = []
+ if isinstance(clouds, dict):
+ for configuration in configurations:
+ cloud = configuration.get(CLOUD_CONFIGURATION_KEY)
+ if cloud:
+ cloud_specifications.append(get_cloud_specification(cloud, clouds, clouds_public)) # might be None
+ return cloud_specifications
diff --git a/bibigrid2/core/utility/handler/logging_path_handler.py b/bibigrid2/core/utility/handler/logging_path_handler.py
new file mode 100644
index 000000000..420314520
--- /dev/null
+++ b/bibigrid2/core/utility/handler/logging_path_handler.py
@@ -0,0 +1,18 @@
+"""
+This module holds methods to return the logfile's path.
+"""
+
+import logging
+
+LOG = logging.getLogger("bibigrid")
+
+def get_logging_path():
+ """
+ Returns the path where the logfile is stored
+ @return: the path where the logfile is stored
+ """
+ for handler in logging.getLoggerClass().root.handlers:
+ if hasattr(handler, 'baseFilename'):
+ log_path = handler.baseFilename
+ return log_path
+ return None
diff --git a/bibigrid2/core/utility/handler/provider_handler.py b/bibigrid2/core/utility/handler/provider_handler.py
new file mode 100644
index 000000000..45434505e
--- /dev/null
+++ b/bibigrid2/core/utility/handler/provider_handler.py
@@ -0,0 +1,64 @@
+"""
+This module contains different selectors to pick and create a connection to the right provider.
+"""
+
+import logging
+
+from bibigrid2.core.utility.handler import configuration_handler
+from bibigrid2.openstack import openstack_provider
+
+PROVIDER_NAME_DICT = {"openstack": openstack_provider.OpenstackProvider}
+PROVIDER_CLASS_DICT = {provider.__name__: provider for provider in PROVIDER_NAME_DICT.values()}
+LOG = logging.getLogger("bibigrid")
+
+def get_provider_by_class_name(provider_name, provider_dict=PROVIDER_CLASS_DICT): # pylint: disable=dangerous-default-value
+ """
+ Returns provider that is associated with the key provider_name in provider_dict.
+ Otherwise a KeyError is thrown.
+ :param provider_name: key of provider_dict
+ :return: provider
+ """
+ return provider_dict[provider_name]
+
+
+def get_provider_by_name(provider_name, provider_dict=PROVIDER_NAME_DICT): # pylint: disable=dangerous-default-value
+ """
+ Returns provider that is associated with the key provider_name in provider_dict.
+ Returns None if no provider is associated with provider_name.
+ :param provider_name: key of provider_dict
+ :return: provider
+ """
+ return provider_dict.get(provider_name)
+
+
+def get_provider_list_by_name_list(provider_name_list, cloud_specifications):
+ """
+ Returns provider list for given provider_name_list
+ If name is not found in PROVIDER_NAME_DICT, PROVIDER_CLASS_DICT is tried instead.
+ If it is not found in either, a KeyError is thrown.
+ :param provider_name_list: list of provider names
+ :param cloud_specifications: list of cloud specifications
+ :return: list of providers
+ """
+ provider_list = [
+ (get_provider_by_name(provider_name) or get_provider_by_class_name(provider_name))(cloud_specification)
+ for provider_name, cloud_specification in zip(provider_name_list, cloud_specifications)]
+ return provider_list
+
+
+def get_providers(configurations):
+ """
+ Reads list of provider_names from configurations.
+ Determines list of providers by provider_names and returns it.
+ If a provider name doesn't match, the KeyError is caught, a warning is logged and None is returned.
+ :param configurations:
+ :return:
+ """
+ cloud_specifications = configuration_handler.get_cloud_specifications(configurations)
+ if cloud_specifications:
+ try:
+ provider_names = configuration_handler.get_list_by_key(configurations, "infrastructure")
+ return get_provider_list_by_name_list(provider_names, cloud_specifications)
+ except KeyError as exc:
+ LOG.warning("Check infrastructure in configurations! Key: %s", str(exc))
+ return None
diff --git a/bibigrid2/core/utility/handler/ssh_handler.py b/bibigrid2/core/utility/handler/ssh_handler.py
new file mode 100644
index 000000000..c0c6f152d
--- /dev/null
+++ b/bibigrid2/core/utility/handler/ssh_handler.py
@@ -0,0 +1,229 @@
+"""
+This module handles ssh and sftp connections to master and vpnwkrs. It also holds general execution routines used to
+set up the cluster.
+"""
+
+import logging
+import os
+import time
+import socket
+import paramiko
+import yaml
+
+from bibigrid2.models.exceptions import ConnectionException, ExecutionException
+from bibigrid2.core.utility import ansible_commands as aC
+
+PRIVATE_KEY_FILE = ".ssh/id_ecdsa" # to name bibigrid-temp keys identically on remote
+ANSIBLE_SETUP = [aC.NO_UPDATE, aC.UPDATE,
+ aC.PYTHON3_PIP, aC.ANSIBLE_PASSLIB,
+ (f"chmod 600 {PRIVATE_KEY_FILE}","Adjust private key permissions."),
+ aC.PLAYBOOK_HOME,
+ aC.PLAYBOOK_HOME_RIGHTS,
+ aC.ADD_PLAYBOOK_TO_LINUX_HOME]
+# ANSIBLE_START = [aC.WAIT_READY, aC.UPDATE, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE] # another UPDATE seems not to be necessary.
+ANSIBLE_START = [aC.WAIT_READY, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE]
+VPN_SETUP = ["echo Example"]
+LOG = logging.getLogger("bibigrid")
+
+
+def get_ac_command(master_provider, name):
+ """
+ Get command to write application credentials to remote.
+ @param master_provider: provider that holds the master
+ @param name: how the application credential shall be called
+ @return: command to execute on remote to create application credential
+ """
+ master_cloud_specification = master_provider.cloud_specification
+ auth = master_cloud_specification["auth"]
+ ac_clouds_yaml = {"clouds": {"master": None}}
+ if auth.get("application_credential_id") and auth.get("application_credential_secret"):
+ wanted_keys = ["auth", "region_name", "interface", "identity_api_version", "auth_type"]
+ ac_cloud_specification = {k: master_cloud_specification[k] for k in wanted_keys if k in
+ master_cloud_specification}
+ else:
+ wanted_keys = ["region_name", "interface", "identity_api_version"]
+ ac = master_provider.create_application_credential(name=name) # pylint: disable=invalid-name
+ ac_dict = {"application_credential_id": ac["id"], "application_credential_secret": ac["secret"],
+ "auth_type": "v3applicationcredential", "auth_url": auth["auth_url"]}
+ ac_cloud_specification = {k: master_cloud_specification[k] for k in wanted_keys if k in
+ master_cloud_specification}
+ ac_cloud_specification.update(ac_dict)
+ ac_clouds_yaml["clouds"]["master"] = ac_cloud_specification
+ return (f"echo '{yaml.safe_dump(ac_clouds_yaml)}' | sudo install -D /dev/stdin /etc/openstack/clouds.yaml",
+ "Copy application credentials.")
+
+
+def get_add_ssh_public_key_commands(ssh_public_key_files):
+ """
+ Builds and returns the necessary commands to add given public keys to remote for additional access.
+ :param ssh_public_key_files: public keys to add
+ :return: list of public key add commands
+ """
+ commands = []
+ if ssh_public_key_files:
+ for ssh_public_key_file in ssh_public_key_files:
+ with open(ssh_public_key_file, mode="r", encoding="UTF-8") as ssh_public_key:
+ commands.append((f"echo {ssh_public_key.readline().strip()} >> .ssh/authorized_keys",
+ f"Add SSH Key {ssh_public_key_file}."))
+ return commands
+
+
+def copy_to_server(sftp, localpath, remotepath):
+ """
+ Recursively copies files and folders to server.
+ If a folder is given as localpath, the structure within will be kept.
+ :param sftp: sftp connection
+ :param localpath: file or folder locally
+ :param remotepath: file or folder on the remote
+ :return:
+ """
+ LOG.debug("Copy %s to %s...", localpath, remotepath)
+ if os.path.isfile(localpath):
+ sftp.put(localpath, remotepath)
+ else:
+ try:
+ sftp.mkdir(remotepath)
+ except OSError:
+ pass
+ for filename in os.listdir(localpath):
+ copy_to_server(sftp, localpath + "/" + filename, remotepath + "/" + filename)
+
+
+def is_active(client, floating_ip_address, private_key, username, timeout=5):
+ """
+ Checks if connection is possible and therefore if server is active.
+ Raises paramiko.ssh_exception.NoValidConnectionsError if timeout is reached
+ :param client: created client
+ :param floating_ip_address: ip to connect to
+ :param private_key: SSH-private_key
+ :param username: SSH-username
+ :param timeout: maximum number of connection attempts
+ (waiting time between attempts grows exponentially, 2**attempts, before failure is accepted)
+ """
+ attempts = 0
+ establishing_connection = True
+ while establishing_connection:
+ try:
+ client.connect(hostname=floating_ip_address, username=username, pkey=private_key, timeout=5, auth_timeout=5)
+ establishing_connection = False
+ except paramiko.ssh_exception.NoValidConnectionsError as exc:
+ LOG.info(f"Attempting to connect to {floating_ip_address}... This might take a while", )
+ if attempts < timeout:
+ time.sleep(2 ** attempts)
+ attempts += 1
+ else:
+ LOG.error(f"Attempt to connect to {floating_ip_address} failed.")
+ raise ConnectionException(exc) from exc
+ except socket.timeout as exc:
+ LOG.warning("Socket timeout exception occurred. Try again ...")
+ if attempts < timeout:
+ attempts += 1
+ else:
+ LOG.error(f"Attempt to connect to {floating_ip_address} failed, due to a socket timeout.")
+ raise ConnectionException(exc) from exc
+ except TimeoutError as exc: # pylint: disable=duplicate-except
+ LOG.error("The attempt to connect to %s failed. Possible known reasons:"
+ "\n\t-Your network's security group doesn't allow SSH.", floating_ip_address)
+ raise ConnectionException(exc) from exc
+
+
+def line_buffered(f):
+ """
+ Generator that yields the remote output line-buffered, based on
+ https://stackoverflow.com/questions/25260088/paramiko-with-continuous-stdout
+ :param f: file-like object of a paramiko channel
+ :return: generator yielding complete output chunks ending in a newline
+ """
+ line_buf = b""
+ while not f.channel.exit_status_ready():
+ line_buf += f.read(1024)
+ if line_buf.endswith(b'\n'):
+ yield line_buf
+ line_buf = b''
+
+
+def execute_ssh_cml_commands(client, commands):
+ """
+ Executes commands and logs exit_status accordingly.
+ :param client: Client with connection to remote
+ :param commands: list of (command, description) tuples to execute on remote
+ """
+ for command in commands:
+ ssh_stdin, ssh_stdout, ssh_stderr = client.exec_command(command[0]) # pylint: disable=unused-variable
+ ssh_stdout.channel.set_combine_stderr(True)
+ LOG.info(f"REMOTE: {command[1]}")
+
+ while True:
+ line = ssh_stdout.readline()
+ if len(line) == 0:
+ break
+ if "[BIBIGRID]" in line:
+ LOG.info(f"REMOTE: {line.strip()}")
+ else:
+ LOG.debug(f"REMOTE: {line.strip()}")
+
+ # get exit status
+ exit_status = ssh_stdout.channel.recv_exit_status()
+ # close handler
+ ssh_stdout.close()
+
+ if exit_status:
+ msg = f"{command[1]} ... Exit status: {exit_status}"
+ LOG.warning(msg)
+ raise ExecutionException(msg)
+
+
+def ansible_preparation(floating_ip, private_key, username, commands=None, filepaths=None):
+ """
+ Installs Python and pip. Then installs Ansible via pip.
+ Copies private key to instance so cluster-nodes are reachable and sets permission as necessary.
+ Copies additional files and executes additional commands if given.
+ The playbook is copied later, because it needs all servers setup and is not time intensive.
+ See: create.update_playbooks
+ :param floating_ip: public ip of server to ansible-prepare
+ :param private_key: generated private key of all cluster servers
+ :param username: username of all servers
+ :param commands: additional commands to execute
+ :param filepaths: additional files to copy: (localpath, remotepath)
+ """
+ if filepaths is None:
+ filepaths = []
+ if commands is None:
+ commands = []
+ LOG.info("Ansible preparation...")
+ commands = ANSIBLE_SETUP + commands
+ filepaths.append((private_key, PRIVATE_KEY_FILE))
+ execute_ssh(floating_ip, private_key, username, commands, filepaths)
+
+
+def execute_ssh(floating_ip, private_key, username, commands=None, filepaths=None):
+ """
+ Executes commands on remote and copies files given in filepaths
+ :param floating_ip: public ip of remote
+ :param private_key: key of remote
+ :param username: username of remote
+ :param commands: commands
+ :param filepaths: filepaths (localpath, remotepath)
+ """
+ if commands is None:
+ commands = []
+ paramiko_key = paramiko.ECDSAKey.from_private_key_file(private_key)
+ with paramiko.SSHClient() as client:
+ client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ try:
+ is_active(client=client,
+ floating_ip_address=floating_ip,
+ username=username,
+ private_key=paramiko_key)
+ except ConnectionException as exc:
+ LOG.error(f"Couldn't connect to floating ip {floating_ip} using private key {private_key}.")
+ raise exc
+ else:
+ if filepaths:
+ sftp = client.open_sftp()
+ for localpath, remotepath in filepaths:
+ copy_to_server(sftp=sftp, localpath=localpath, remotepath=remotepath)
+ LOG.debug("SFTP: Files %s copied.", filepaths)
+ if commands:
+ execute_ssh_cml_commands(client, commands)
diff --git a/bibigrid2/core/utility/id_generation.py b/bibigrid2/core/utility/id_generation.py
new file mode 100644
index 000000000..24d6718e5
--- /dev/null
+++ b/bibigrid2/core/utility/id_generation.py
@@ -0,0 +1,59 @@
+"""
+Generates ids and munge keys
+"""
+
+import shortuuid
+
+from bibigrid2.core.actions import create
+
+MAX_ID_LENGTH = 15
+CLUSTER_UUID_ALPHABET = '0123456789abcdefghijkmnopqrstuvwxyz'
+
+
+def generate_cluster_id():
+ """
+ Generates a random shortUUID of length MAX_ID_LENGTH using CLUSTER_UUID_ALPHABET.
+ :return:
+ """
+ uuid = shortuuid.ShortUUID()
+ uuid.set_alphabet(CLUSTER_UUID_ALPHABET)
+ return uuid.random(MAX_ID_LENGTH)
+
+
+def generate_safe_cluster_id(providers):
+ """
+ Generates a cluster_id and checks that it is not already in use. The first unique id found is returned.
+ :param providers: providers to check whether they use said cluster_id
+ :return: cluster_id
+ """
+ id_is_unique = False
+ cluster_id = None
+ while not id_is_unique:
+ cluster_id = generate_cluster_id()
+ id_is_unique = is_unique_cluster_id(cluster_id, providers)
+ return cluster_id
+
+
+def is_unique_cluster_id(cluster_id, providers):
+ """
+ Checks if cluster_id is not in use on any provider
+ :param cluster_id: generated cluster_id
+ :param providers: providers to check
+ :return: True if cluster_id is unique. False else.
+ """
+ for provider in providers:
+ for server in provider.list_servers():
+ master = create.MASTER_IDENTIFIER(cluster_id=cluster_id)
+ vpnwkr = create.VPN_WORKER_IDENTIFIER(cluster_id=cluster_id)
+ worker = create.WORKER_IDENTIFIER(cluster_id=cluster_id)
+ if server["name"] in [master, vpnwkr, worker]:
+ return False
+ return True
+
+
+def generate_munge_key():
+ """
+ Generates a munge key (UUID) for slurm
+ :return:
+ """
+ return shortuuid.ShortUUID().random(32)
diff --git a/bibigrid2/core/utility/paths/ansible_resources_path.py b/bibigrid2/core/utility/paths/ansible_resources_path.py
new file mode 100644
index 000000000..d48336568
--- /dev/null
+++ b/bibigrid2/core/utility/paths/ansible_resources_path.py
@@ -0,0 +1,54 @@
+"""
+Paths that are used by Ansible, especially the playbook, vars files and the like.
+"""
+
+import os
+
+import bibigrid2.core.utility.paths.basic_path as bP
+
+# UNIVERSAL
+ANSIBLE_HOSTS: str = "ansible_hosts"
+COMMON_YML: str = "common.yml"
+SITE_YML: str = "site.yml"
+REQUIREMENTS_YML: str = "requirements.yml"
+UPLOAD_PATH: str = "/tmp/roles/"
+VARS_PATH: str = "vars/"
+ROLES_PATH: str = "roles/"
+LOGIN_YML: str = VARS_PATH + "login.yml"
+INSTANCES_YML: str = VARS_PATH + "instances.yml"
+CONFIG_YML: str = VARS_PATH + "common_configuration.yml"
+WORKER_SPECIFICATION_YML: str = VARS_PATH + "worker_specification.yml"
+ADDITIONAL_ROLES_PATH: str = ROLES_PATH + "additional/"
+DEFAULT_IP_FILE = VARS_PATH + "{{ ansible_default_ipv4.address }}.yml"
+# ANSIBLE_CFG = "ansible.cfg"
+
+# LOCAL
+# ANSIBLE_CFG_PATH = os.path.join(bP.RESOURCES_PATH, ANSIBLE_CFG)
+PLAYBOOK = "playbook/"
+PLAYBOOK_PATH: str = os.path.join(bP.RESOURCES_PATH, PLAYBOOK)
+HOSTS_CONFIG_FILE: str = PLAYBOOK_PATH + ANSIBLE_HOSTS
+CONFIG_ROOT_PATH: str = PLAYBOOK_PATH + VARS_PATH
+ROLES_ROOT_PATH: str = PLAYBOOK_PATH + ROLES_PATH
+COMMONS_LOGIN_FILE: str = PLAYBOOK_PATH + LOGIN_YML
+COMMONS_INSTANCES_FILE: str = PLAYBOOK_PATH + INSTANCES_YML
+COMMONS_CONFIG_FILE: str = PLAYBOOK_PATH + CONFIG_YML
+SITE_CONFIG_FILE: str = PLAYBOOK_PATH + SITE_YML
+WORKER_SPECIFICATION_FILE: str = PLAYBOOK_PATH + WORKER_SPECIFICATION_YML
+ADDITIONAL_ROLES_ROOT_PATH: str = ROLES_ROOT_PATH + ADDITIONAL_ROLES_PATH
+VARS_FOLDER = os.path.join(PLAYBOOK_PATH, VARS_PATH)
+
+# REMOTE
+ROOT_PATH_REMOTE = "~"
+PLAYBOOK_PATH_REMOTE: str = os.path.join("/opt/", PLAYBOOK)
+# PLAYBOOK_PATH_REMOTE: str = os.path.join(ROOT_PATH_REMOTE, PLAYBOOK)
+# PLAYBOOK_PATH_REMOTE_SLURM: str = os.path.join("/opt/slurm/", PLAYBOOK)
+HOSTS_CONFIG_FILE_REMOTE: str = PLAYBOOK_PATH_REMOTE + ANSIBLE_HOSTS
+CONFIG_ROOT_PATH_REMOTE: str = PLAYBOOK_PATH_REMOTE + VARS_PATH
+ROLES_ROOT_PATH_REMOTE: str = PLAYBOOK_PATH_REMOTE + ROLES_PATH
+COMMONS_LOGIN_FILE_REMOTE: str = PLAYBOOK_PATH_REMOTE + LOGIN_YML
+COMMONS_INSTANCES_FILE_REMOTE: str = PLAYBOOK_PATH_REMOTE + INSTANCES_YML
+COMMONS_CONFIG_FILE_REMOTE: str = PLAYBOOK_PATH_REMOTE + CONFIG_YML
+SITE_CONFIG_FILE_REMOTE: str = PLAYBOOK_PATH_REMOTE + SITE_YML
+WORKER_SPECIFICATION_FILE_REMOTE: str = PLAYBOOK_PATH_REMOTE + WORKER_SPECIFICATION_YML
+ADDITIONAL_ROLES_ROOT_PATH_REMOTE: str = ROLES_ROOT_PATH_REMOTE + ADDITIONAL_ROLES_PATH
+REQUIREMENTS_CONFIG_FILE_REMOTE: str = ADDITIONAL_ROLES_ROOT_PATH_REMOTE + REQUIREMENTS_YML
diff --git a/bibigrid2/core/utility/paths/basic_path.py b/bibigrid2/core/utility/paths/basic_path.py
new file mode 100644
index 000000000..742fc0ed4
--- /dev/null
+++ b/bibigrid2/core/utility/paths/basic_path.py
@@ -0,0 +1,11 @@
+"""
+Module containing the most basic paths. Must stay at the same place relative to root.
+"""
+
+import os
+from pathlib import Path
+
+RESOURCES = "resources"
+# if the relative path from this file to resources is altered, the next line must be adapted or files will not be found.
+ROOT_PATH = Path(__file__).absolute().parents[4]
+RESOURCES_PATH = os.path.join(ROOT_PATH, RESOURCES)
diff --git a/bibigrid2/core/utility/paths/bin_path.py b/bibigrid2/core/utility/paths/bin_path.py
new file mode 100644
index 000000000..a99058dcb
--- /dev/null
+++ b/bibigrid2/core/utility/paths/bin_path.py
@@ -0,0 +1,13 @@
+"""
+Paths that are used by bin script copying
+"""
+
+
+import os
+
+import bibigrid2.core.utility.paths.basic_path as bP
+
+BIN: str = "bin/"
+BIN_PATH: str = os.path.join(bP.RESOURCES_PATH, BIN)
+
+BIN_PATH_REMOTE: str = BIN
diff --git a/bibigrid2/core/utility/validate_configuration.py b/bibigrid2/core/utility/validate_configuration.py
new file mode 100644
index 000000000..93662d1fe
--- /dev/null
+++ b/bibigrid2/core/utility/validate_configuration.py
@@ -0,0 +1,439 @@
+"""
+Validates configuration and cloud_specification
+"""
+
+import logging
+import os
+
+from bibigrid2.core.utility.handler import configuration_handler
+
+ACCEPTED_KEY_IDENTIFIERS = {"RSA": 4096, "ECDSA": 521, "ED25519": 256}
+LOG = logging.getLogger("bibigrid")
+
+def evaluate(check_name, check_result):
+ """
+ Logs check_result as a warning if it failed and as info if it succeeded.
+ :param check_name:
+ :param check_result:
+ :return:
+ """
+ if check_result:
+ LOG.info("Checking %s: Success", check_name)
+ else:
+ LOG.warning("Checking %s: Failure", check_name)
+ return check_result
+
+
+def check_provider_data(provider_data_list, provider_count):
+ """
+ Checks if all provider entries are unique and if enough providers are given.
+ #ToDo for multiple cloud locations additional provider data needs to be added
+ :param provider_data_list: list of all provider data
+ :param provider_count: number of providers
+ :return: True if enough providers are given and all providers are unique
+ """
+ LOG.info("Checking provider names")
+ success = True
+ duplicates = []
+ seen = []
+ for elem in provider_data_list:
+ if elem in seen:
+ duplicates.append(elem)
+ else:
+ seen.append(elem)
+ if duplicates:
+ LOG.warning("Duplicate provider(s) %s. For each provider you can only create one configuration. "
+ "Please check your configurations.", duplicates)
+ success = False
+ else:
+ LOG.info("All providers are unique.")
+ if not len(provider_data_list) == provider_count:
+ LOG.warning("Not enough providers given. %s/%s", len(provider_data_list), provider_count)
+ success = False
+ else:
+ LOG.info("Enough providers given. %s/%s", len(provider_data_list), provider_count)
+ return success
+
+
+def evaluate_ssh_public_key_file_security(ssh_public_key_file):
+ """
+ Checks if the key encryption is sufficiently strong. Uses empirical values and therefore fails if the key type is unknown.
+ @param ssh_public_key_file:
+ @return:
+ """
+ success = True
+ # length, key, comment list, identifier_dirty
+ key_info = os.popen(f'ssh-keygen -l -f {ssh_public_key_file}').read().split()
+ length = key_info[0]
+ identifier_clean = key_info[-1].strip("()\n")
+ minimum_size = ACCEPTED_KEY_IDENTIFIERS.get(identifier_clean)
+
+ if not minimum_size:
+ LOG.warning("sshPublicKey '%s' is %s, whose secure minimum length is unknown to bibigrid2.\n"
+ "Known encryptions are (with minimum size): %s",
+ ssh_public_key_file, identifier_clean, ACCEPTED_KEY_IDENTIFIERS)
+ success = False
+ else:
+ LOG.info("sshPublicKey '%s' is a known encryption.", ssh_public_key_file)
+ if minimum_size > int(length):
+ LOG.warning("sshPublicKey '%s' is not long enough! %s should be >= %s, but is %s",
+ ssh_public_key_file, identifier_clean, minimum_size, int(length))
+ success = False
+ else:
+ LOG.info("sshPublicKey '%s' is long enough (%s/%s)!", ssh_public_key_file, int(length), minimum_size)
+ return success
+
+
+def has_enough(maximum, needed, keeper, thing):
+ """
+ Method logs and compares whether enough free things are available
+ :param maximum: maximum (available) resources of thing
+ :param needed: minimum needed to run
+ :param keeper: description of the object having the thing that is checked (for logging)
+ :param thing: description of what resource is checked (RAM for example) (for logging)
+ :return: True if maximum is larger or equal to the needed
+ """
+ success = True
+ if maximum >= needed:
+ LOG.info("%s has enough %s: %s/%s", keeper, thing, needed, maximum)
+ elif maximum < 0:
+ LOG.warning("%s returns no valid value for %s: %s/%s -- Ignored.", keeper, thing, needed, maximum)
+ else:
+ LOG.warning("%s has not enough %s: %s/%s", keeper, thing, needed, maximum)
+ success = False
+ return success
+
+
+def check_clouds_yaml_security():
+ """
+ Checks security of all clouds in clouds.yaml i.e. whether sensitive information is stored in clouds-public.yaml
+ @return: True if no sensitive information is stored in clouds-public.yaml. False else.
+ """
+ success = True
+ LOG.info("Checking validity of entire clouds.yaml and clouds-public.yaml")
+ clouds, clouds_public = configuration_handler.get_clouds_files() # pylint: disable=unused-variable
+ if clouds_public:
+ for cloud in clouds_public:
+ if clouds_public[cloud].get("profile"):
+ LOG.warning(f"{cloud}: Profiles should be placed in clouds.yaml not clouds-public.yaml! "
+ f"Key ignored.")
+ success = False
+ if clouds_public[cloud].get("auth"):
+ for key in ["password", "username", "application_credential_id", "application_credential_secret"]:
+ if clouds_public[cloud]["auth"].get(key):
+ LOG.warning(f"{cloud}: {key} shouldn't be shared. Move {key} to clouds.yaml!")
+ success = False
+ return success
+
+
+def check_cloud_yaml(cloud_specification):
+ """
+ Check if cloud_specification is valid, i.e. contains the necessary authentication data.
+ @param cloud_specification: dict to check whether it is a valid cloud_specification
+ @return: True if cloud_specification is valid. False else.
+ """
+ success = True
+ if cloud_specification:
+ keys = cloud_specification.keys()
+ auth = cloud_specification.get("auth")
+ if auth:
+ auth_keys = auth.keys()
+ if not ("password" in auth_keys and "username" in auth_keys) \
+ and not ("auth_type" in keys and "application_credential_id" in auth_keys and
+ "application_credential_secret" in auth_keys):
+ LOG.warning("Insufficient authentication information. Needs either password and username or "
+ "if using application credentials: "
+ "auth_type, application_credential_id and application_credential_secret.")
+ success = False
+ if "auth_url" not in auth_keys:
+ LOG.warning("Authentification URL auth_url is missing.")
+ success = False
+ else:
+ LOG.warning("Missing all auth information!")
+ success = False
+ if "region_name" not in keys:
+ LOG.warning("region_name is missing.")
+ success = False
+ else:
+ LOG.warning("Missing all cloud_specification information!")
+ return success
+
+
+class ValidateConfiguration:
+ """
+ This class contains necessary algorithms to validate configuration files
+ """
+
+ def __init__(self, configurations, providers):
+ """
+ Sets configurations, providers and prepares the required_resources_dict.
+ While executing the checks, needed resources are counted.
+ In the end check_quotas will decide whether enough resources are available.
+ :param configurations: List of configurations (dicts)
+ :param providers: List of providers
+ """
+ self.configurations = configurations
+ self.providers = providers
+ self.required_resources_dict = {'total_cores': 0, 'floating_ips': 0, 'instances': 0, 'total_ram': 0,
+ 'Volumes': 0, 'VolumeGigabytes': 0, 'Snapshots': 0, 'Backups': 0,
+ 'BackupGigabytes': 0}
+
+ def validate(self):
+ """
+ Validation of the configuration file with the selected cloud provider.
+ The validation steps are as follows:
+ Check connection can be established
+ Check provider uniqueness
+ Check servergroup
+ Check instances are available
+ Check images and volumes are available
+ Check network and subnet are available
+ Check quotas
+ :return:
+ """
+ success = bool(self.providers)
+ LOG.info("Validating config file...")
+ success = check_provider_data(
+ configuration_handler.get_list_by_key(self.configurations, "infrastructure"),
+ len(self.configurations)) and success
+ if not success:
+ LOG.warning("Providers not set correctly in configuration file. Check log for more detail.")
+ return success
+ checks = [("master/vpn", self.check_master_vpn_worker), ("servergroup", self.check_server_group),
+ ("instances", self.check_instances), ("volumes", self.check_volumes),
+ ("network", self.check_network), ("quotas", self.check_quotas),
+ ("sshPublicKeyFiles", self.check_ssh_public_key_files), ("cloudYamls", self.check_clouds_yamls)]
+ if success:
+ for check_name, check_function in checks:
+ success = evaluate(check_name, check_function()) and success
+ return success
+
+ def check_master_vpn_worker(self):
+ """
+ Checks if first configuration has a masterInstance defined
+ and every other configuration has a vpnInstance defined.
+ If one is missing, said provider wouldn't be reachable within the cluster, because no floating IP would be assigned.
+ :return: True if first configuration has a masterInstance and every other a vpnInstance
+ """
+ LOG.info("Checking master/vpn")
+ success = True
+ if not self.configurations[0].get("masterInstance") or self.configurations[0].get("vpnInstance"):
+ success = False
+ for configuration in self.configurations[1:]:
+ if not configuration.get("vpnInstance") or configuration.get("masterInstance"):
+ success = False
+ return success
+
+ def check_provider_connections(self):
+ """
+ Checks if all providers are reachable
+ :return: True if all providers are reachable
+ """
+ success = True
+ providers_unconnectable = []
+ for provider in self.providers:
+ if not provider.conn:
+ providers_unconnectable.append(provider.name)
+ if providers_unconnectable:
+ LOG.warning("API connection to %s not successful. Please check your configuration.",
+ providers_unconnectable)
+ success = False
+ return success
+
+ def check_instances(self):
+ """
+ Checks if all instances exist and image and instance-type are compatible
+ :return: true if image and instance-type (flavor) exist for all instances and are compatible
+ """
+ LOG.info("Checking instance images and type")
+ success = True
+ configuration = None
+ try:
+ for configuration, provider in zip(self.configurations, self.providers):
+ self.required_resources_dict["floating_ips"] += 1
+ if configuration.get("masterInstance"):
+ success = self.check_instance("masterInstance", configuration["masterInstance"], provider) \
+ and success
+ else:
+ success = self.check_instance("vpnInstance", configuration["vpnInstance"], provider) and success
+ for worker in configuration.get("workerInstances", []):
+ success = self.check_instance("workerInstance", worker, provider) and success
+ except KeyError as exc:
+ LOG.warning("Not found %s, but required in configuration %s.", str(exc), configuration)
+ success = False
+ return success
+
+ def check_instance(self, instance_name, instance, provider):
+ """
+ Checks if instance image exists and whether it is compatible with the defined instance/server type (flavor).
+ :param instance_name: containing name for logging purposes
+ :param instance: dict containing image, type and count (count is not used)
+ :param provider: provider
+ :return: true if type and image compatible and existing
+ """
+ self.required_resources_dict["instances"] += instance.get("count") or 1
+ instance_image_id_or_name = instance["image"]
+ instance_image = provider.get_image_by_id_or_name(image_id_or_name=instance_image_id_or_name)
+ if not instance_image:
+ LOG.warning("Instance %s image: %s not found", instance_name, instance_image_id_or_name)
+ print("Available active images:")
+ print("\n".join(provider.get_active_images()))
+ return False
+ if instance_image["status"] != "active":
+ LOG.warning("Instance %s image: %s not active", instance_name, instance_image_id_or_name)
+ print("Available active images:")
+ print("\n".join(provider.get_active_images))
+ return False
+ LOG.info("Instance %s image: %s found", instance_name, instance_image_id_or_name)
+ instance_type = instance["type"]
+ return self.check_instance_type_image_combination(instance_type, instance_image, provider)
+
+ def check_instance_type_image_combination(self, instance_type, instance_image, provider):
+ """
+ Checks if instance_type (flavor) on the provider offers enough RAM and disk space for instance_image.
+ :param instance_type: flavor name
+ :param instance_image: image dict
+ :param provider: provider
+ :return: True, if enough resources are available
+ """
+ success = True
+ # check
+ flavor = provider.get_flavor(instance_type)
+ if not flavor:
+ LOG.warning("Flavor %s does not exist.", instance_type)
+ print("Available flavors:")
+ print("\n".join(provider.get_active_flavors()))
+ return False
+ type_max_disk_space = flavor["disk"]
+ type_max_ram = flavor["ram"]
+ image_min_disk_space = instance_image["min_disk"]
+ image_min_ram = instance_image["min_ram"]
+ for maximum, needed, thing in [(type_max_disk_space, image_min_disk_space, "disk space"),
+ (type_max_ram, image_min_ram, "ram")]:
+ success = has_enough(maximum, needed, f"Type {instance_type}", thing) and success
+ # prepare check quotas
+ self.required_resources_dict["total_ram"] += type_max_ram
+ self.required_resources_dict["total_cores"] += flavor["vcpus"]
+ return success
+
+ def check_volumes(self):
+ """
+ Checks whether a volume or snapshot exists for every entry in masterMounts.
+ :return: True if all snapshots and volumes are found. Else False.
+ """
+ LOG.info("Checking volumes...")
+ success = True
+ for configuration, provider in zip(self.configurations, self.providers):
+ volume_identifiers = configuration.get("masterMounts")
+ if volume_identifiers:
+ # check individually if volumes exist
+ for volume_identifier in volume_identifiers:
+ if ":" in volume_identifier:
+ volume_name_or_id = volume_identifier[:volume_identifier.index(":")]
+ else:
+ volume_name_or_id = volume_identifier
+ volume = provider.get_volume_by_id_or_name(volume_name_or_id)
+ if not volume:
+ snapshot = provider.get_volume_snapshot_by_id_or_name(volume_name_or_id)
+ if not snapshot:
+ LOG.warning("Neither Volume nor Snapshot '%s' found", volume_name_or_id)
+ success = False
+ else:
+ LOG.info("Snapshot '%s' found", volume_name_or_id)
+ self.required_resources_dict["Volumes"] += 1
+ self.required_resources_dict["VolumeGigabytes"] += snapshot["size"]
+ else:
+ LOG.info(f"Volume '{volume_name_or_id}' found")
+ return success
+
+ def check_network(self):
+ """
+ Check if network (or subnet) is accessible
+ :return True if any given network or subnet is accessible by provider
+ """
+ LOG.info("Checking network...")
+ success = True
+ for configuration, provider in zip(self.configurations, self.providers):
+ network_name_or_id = configuration.get("network")
+ if network_name_or_id:
+ network = provider.get_network_by_id_or_name(network_name_or_id)
+ if not network:
+ LOG.warning(f"Network '{network_name_or_id}' not found", network_name_or_id)
+ success = False
+ else:
+ LOG.info(f"Network '{subnet_name_or_id}' found")
+ subnet_name_or_id = configuration.get("subnet")
+ if subnet_name_or_id:
+ subnet = provider.get_subnet_by_id_or_name(subnet_name_or_id)
+ if not subnet:
+ LOG.warning(f"Subnet '{subnet_name_or_id}' not found")
+ success = False
+ else:
+ LOG.info(f"Subnet '{subnet_name_or_id}' found")
+ return bool(success and (network_name_or_id or subnet_name_or_id))
+
+ def check_server_group(self):
+ """
+ Checks if the serverGroup given in each configuration is accessible.
+ :return: True if all server groups are accessible
+ """
+ success = True
+ for configuration, provider in zip(self.configurations, self.providers):
+ server_group_name_or_id = configuration.get("serverGroup")
+ if server_group_name_or_id:
+ server_group = provider.get_server_group_by_id_or_name(server_group_name_or_id)
+ if not server_group:
+ LOG.warning("ServerGroup '%s' not found", server_group_name_or_id)
+ success = False
+ else:
+ LOG.info("ServerGroup '%s' found", server_group_name_or_id)
+ return success
+
+ def check_quotas(self):
+ """
+ Gets remaining resources from the provider and compares them to the needed resources.
+ Needed resources are set during the other checks.
+ Covered resources are: cores, floating_ips, instances, ram, volumes, volumeGigabytes, snapshots, backups and
+ backupGigabytes. If a concrete provider implementation is unable to return remaining resources, a maximum value
+ is returned so the check does not fail merely because of a missing API implementation.
+ :return: True if check succeeded. Else false.
+ """
+ LOG.info("Checking quotas")
+ success = True
+ LOG.info("required/available")
+ for provider in self.providers:
+ free_resources_dict = provider.get_free_resources()
+ for key, value in self.required_resources_dict.items():
+ success = has_enough(free_resources_dict[key],
+ value,
+ f"Project {self.providers[0].cloud_specification['identifier']}",
+ key) and success
+ return success
+
+ def check_ssh_public_key_files(self):
+ """
+ Checks if keys listed in the config exist
+ :return: True if check succeeded. Else false.
+ """
+ success = True
+ for configuration in self.configurations:
+ for ssh_public_key_file in configuration.get("sshPublicKeyFiles") or []:
+ if not os.path.isfile(ssh_public_key_file):
+ LOG.warning("sshPublicKeyFile '%s' not found", ssh_public_key_file)
+ success = False
+ else:
+ LOG.info("sshPublicKeyFile '%s' found", ssh_public_key_file)
+ success = evaluate_ssh_public_key_file_security(ssh_public_key_file) and success
+ return success
+
+ def check_clouds_yamls(self):
+ """
+ Checks if every cloud in clouds_yaml is valid
+ @return: True if all clouds are valid
+ """
+ LOG.info("Checking cloud specifications...")
+ success = True
+ cloud_specifications = configuration_handler.get_cloud_specifications(self.configurations)
+ for index, cloud_specification in enumerate(cloud_specifications):
+ if not check_cloud_yaml(cloud_specification):
+ success = False
+ LOG.warning("Cloud specification %s is faulty. BiBiGrid understood %s.", index, cloud_specification)
+ success = check_clouds_yaml_security() and success
+ return success
diff --git a/bibigrid2/core/utility/yaml_dumper.py b/bibigrid2/core/utility/yaml_dumper.py
new file mode 100644
index 000000000..b301f045a
--- /dev/null
+++ b/bibigrid2/core/utility/yaml_dumper.py
@@ -0,0 +1,15 @@
+"""
+Alternative version of yaml.SafeDumper that ignores aliases.
+"""
+
+import yaml
+
+
+class NoAliasSafeDumper(yaml.SafeDumper):
+ """
+ Only difference to the regular yaml.SafeDumper class is that ignore_aliases is true
+ and therefore aliases are ignored.
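+
+ Example (illustrative):
+ yaml.dump(data, Dumper=NoAliasSafeDumper)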
+ """
+
+ def ignore_aliases(self, data):
+ return True
diff --git a/bibigrid2/models/exceptions.py b/bibigrid2/models/exceptions.py
new file mode 100644
index 000000000..9691e4728
--- /dev/null
+++ b/bibigrid2/models/exceptions.py
@@ -0,0 +1,9 @@
+""" module for additional exceptions """
+
+
+class ConnectionException(Exception):
+ """ Connection exception. """
+
+
+class ExecutionException(Exception):
+ """ Execution exception. """
diff --git a/bibigrid2/models/return_threading.py b/bibigrid2/models/return_threading.py
new file mode 100644
index 000000000..a7c7a1b43
--- /dev/null
+++ b/bibigrid2/models/return_threading.py
@@ -0,0 +1,31 @@
+"""
+Extends threading.
+"""
+
+import threading
+
+
+class ReturnThread(threading.Thread):
+ """
+ Extends the Thread functionality:
+ - The return value of the called function is returned by join()
+ - An exception that occurred within the called function is re-raised by join()
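+
+ Example (illustrative):
+ thread = ReturnThread(target=max, args=([1, 2, 3],))
+ thread.start()
+ result = thread.join() # 3; any exception raised by max would be re-raised here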
+ """
+
+ def __init__(self, group=None, target=None, name=None, args=(), kwargs={}): # pylint: disable=dangerous-default-value
+ threading.Thread.__init__(self, group, target, name, args, kwargs)
+ self._return = None
+ self._exc = None
+
+ def run(self):
+ if self._target is not None:
+ try:
+ self._return = self._target(*self._args, **self._kwargs)
+ except Exception as exc: # pylint: disable=broad-except
+ self._exc = exc
+
+ def join(self, *args):
+ threading.Thread.join(self, *args)
+ if self._exc:
+ raise self._exc
+ return self._return
diff --git a/bibigrid2/openstack/openstack_provider.py b/bibigrid2/openstack/openstack_provider.py
new file mode 100644
index 000000000..c41fce210
--- /dev/null
+++ b/bibigrid2/openstack/openstack_provider.py
@@ -0,0 +1,260 @@
+"""
+Concrete implementation of provider.py for openstack
+"""
+
+import logging
+
+import keystoneclient
+import openstack
+from cinderclient import client
+from keystoneauth1 import session
+from keystoneauth1.exceptions.http import NotFound
+from keystoneauth1.identity import v3
+
+from bibigrid2.core import provider
+from bibigrid2.core.actions import create
+from bibigrid2.core.actions import version
+from bibigrid2.models.exceptions import ExecutionException
+
+LOG = logging.getLogger("bibigrid")
+
+
+class OpenstackProvider(provider.Provider): # pylint: disable=too-many-public-methods
+ """
+ Specific implementation of the Provider class for openstack
+ """
+ NAME = "OpenstackProvider"
+
+ # to be read from clouds.yaml file.
+
+ def __init__(self, cloud_specification):
+ super().__init__(cloud_specification)
+ self.conn = self.create_connection()
+ sess = self.create_session()
+ self.keystone_client = keystoneclient.client.Client(session=sess, interface='public')
+ self.cinder = client.Client(3, session=sess)
+
+ def create_session(self, app_name="openstack_scripts", app_version="1.0"):
+ """
+ Creates and returns a session that can be used to create a connection to different openstack services
+ @param app_name:
+ @param app_version:
+ @return: session
+ """
+ # print(v3)
+ auth = self.cloud_specification["auth"]
+ if all(key in auth for key in ["auth_url", "application_credential_id", "application_credential_secret"]):
+ auth_session = v3.ApplicationCredential(
+ auth_url=auth["auth_url"],
+ application_credential_id=auth["application_credential_id"],
+ application_credential_secret=auth["application_credential_secret"]
+ )
+ elif all(key in auth for key in ["auth_url", "username", "password", "project_id", "user_domain_name"]):
+ auth_session = v3.Password(auth_url=auth["auth_url"],
+ username=auth["username"],
+ password=auth["password"],
+ project_id=auth["project_id"],
+ user_domain_name=auth["user_domain_name"])
+ else:
+ raise KeyError("Not enough authentication information in clouds.yaml/clouds-public.yaml "
+ "to create a session. Use one:\n"
+ "Application Credentials: auth_url, application_credential_id and "
+ "application_credential_secret\n"
+ "Password: auth_url, username, password, project_id and user_domain_name")
+ return session.Session(auth=auth_session,
+ app_name=app_name, app_version=app_version)
+
+ def create_connection(self, app_name="openstack_bibigrid", app_version=version.__version__):
+ auth = self.cloud_specification["auth"]
+ return openstack.connect(
+ load_yaml_config=False,
+ load_envvars=False,
+ auth_url=auth["auth_url"],
+ project_name=auth.get("project_name"),
+ username=auth.get("username"),
+ password=auth.get("password"),
+ region_name=self.cloud_specification["region_name"],
+ user_domain_name=auth.get("user_domain_name"),
+ project_domain_name=auth.get("user_domain_name"),
+ app_name=app_name,
+ app_version=app_version,
+ application_credential_id=auth.get("application_credential_id"),
+ application_credential_secret=auth.get("application_credential_secret"),
+ interface=self.cloud_specification.get("interface"),
+ identity_api_version=self.cloud_specification.get("identity_api_version"),
+ auth_type=self.cloud_specification.get("auth_type")
+ )
+
+ def create_application_credential(self, name=None):
+ return self.keystone_client.application_credentials.create(name=name).to_dict()
+
+ def delete_application_credential_by_id_or_name(self, ac_id_or_name):
+ """
+ Deletes existing application credential by id or name and returns true.
+ If the application credential is not found, it returns False.
+ :param ac_id_or_name: application credential id or name
+ :return: True if deleted else false
+ """
+ try:
+ self.keystone_client.application_credentials.delete(ac_id_or_name) # id
+ return True
+ except NotFound:
+ try:
+ self.keystone_client.application_credentials.delete(
+ self.keystone_client.application_credentials.find(name=ac_id_or_name)) # name
+ return True
+ except NotFound:
+ return False
+
+ def get_image_by_id_or_name(self, image_id_or_name):
+ return self.conn.get_image(name_or_id=image_id_or_name)
+
+ def get_flavor(self, instance_type):
+ return self.conn.get_flavor(instance_type)
+
+ def get_volume_snapshot_by_id_or_name(self, snapshot_id_or_name):
+ return self.conn.get_volume_snapshot(name_or_id=snapshot_id_or_name)
+
+ def get_network_by_id_or_name(self, network_id_or_name):
+ return self.conn.get_network(name_or_id=network_id_or_name)
+
+ def get_subnet_by_id_or_name(self, subnet_id_or_name):
+ return self.conn.get_subnet(name_or_id=subnet_id_or_name)
+
+ def list_servers(self):
+ return [elem.toDict() for elem in self.conn.list_servers()]
+
+ def create_server(self, name, flavor, image,
+ network, key_name=None, wait=True, volumes=None):
+ try:
+ server = self.conn.create_server(name=name, flavor=flavor, image=image,
+ network=network, key_name=key_name, volumes=volumes)
+ except openstack.exceptions.BadRequestException as exc:
+ raise ConnectionError() from exc
+ except openstack.exceptions.SDKException as exc:
+ raise ExecutionException() from exc
+ except AttributeError as exc:
+ raise ExecutionException("Unable to create server due to faulty configuration.") from exc
+ if wait:
+ self.conn.wait_for_server(server=server, auto_ip=False, timeout=600)
+ server = self.conn.get_server(server["id"])
+ return server
+
+ def delete_server(self, name_or_id, delete_ips=True):
+ """
+ Deletes the server, and its floating IP as well if delete_ips is True. The resources are then free again.
+ :param name_or_id:
+ :param delete_ips:
+ :return:
+ """
+ return self.conn.delete_server(name_or_id=name_or_id, wait=False,
+ timeout=180, delete_ips=delete_ips,
+ delete_ip_retry=1)
+
+ def delete_keypair(self, key_name):
+ return self.conn.delete_keypair(key_name)
+
+ def get_server_group_by_id_or_name(self, server_group_id_or_name):
+ return self.conn.get_server_group(name_or_id=server_group_id_or_name)
+
+ def close(self):
+ return self.conn.close()
+
+ def create_keypair(self, name, public_key):
+ return self.conn.create_keypair(name=name, public_key=public_key)
+
+ def get_network_id_by_subnet(self, subnet):
+ subnet = self.conn.get_subnet(subnet)
+ return subnet["network_id"] if subnet else subnet
+
+ def get_subnet_ids_by_network(self, network):
+ network = self.conn.get_network(network)
+ return network["subnets"] if network else network
+
+ def get_free_resources(self):
+ """
+ Uses the cinder API to get all relevant volume resources.
+ https://github.com/openstack/python-cinderclient/blob/master/cinderclient/v3/limits.py
+ Uses the nova API to get all relevant compute resources. Floating-IP is not returned correctly by openstack.
+ :return: Dictionary containing the free resources
+ """
+ compute_limits = dict(self.conn.compute.get_limits()["absolute"])
+ # maybe needs limits.get(os.environ["OS_PROJECT_NAME"]) in the future
+ volume_limits_generator = self.cinder.limits.get().absolute
+ volume_limits = {absolut_limit.name: absolut_limit.value for absolut_limit in
+ volume_limits_generator}
+ # ToDo TotalVolumeGigabytes needs totalVolumeGigabytesUsed, but is not given
+ volume_limits["totalVolumeGigabytesUsed"] = 0
+ free_resources = {}
+ for key in ["total_cores", "floating_ips", "instances", "total_ram"]:
+ free_resources[key] = compute_limits[key] - compute_limits[key + "_used"]
+ for key in ["Volumes", "VolumeGigabytes", "Snapshots", "Backups", "BackupGigabytes"]:
+ free_resources[key] = volume_limits["maxTotal" + key] - volume_limits[
+ "total" + key + "Used"]
+ return free_resources
+
+ def get_volume_by_id_or_name(self, name_or_id):
+ return self.conn.get_volume(name_or_id)
+
+ def create_volume_from_snapshot(self, snapshot_name_or_id):
+ """
+ Uses the cinder API to create a volume from snapshot:
+ https://github.com/openstack/python-cinderclient/blob/master/cinderclient/v3/volumes.py
+ :param snapshot_name_or_id: name or id of snapshot
+ :return: id of created volume
+ """
+ LOG.debug("Trying to create volume from snapshot")
+ snapshot = self.conn.get_volume_snapshot(snapshot_name_or_id)
+ if snapshot:
+ LOG.debug(f"Snapshot {snapshot_name_or_id} found.")
+ if snapshot["status"] == "available":
+ LOG.debug("Snapshot %s is available.", {snapshot_name_or_id})
+ size = snapshot["size"]
+ name = create.PREFIX_WITH_SEP + snapshot["name"]
+ description = f"Created from snapshot {snapshot_name_or_id} by BiBiGrid"
+ volume = self.cinder.volumes.create(size=size, snapshot_id=snapshot["id"], name=name,
+ description=description)
+ return volume.to_dict()["id"]
+ LOG.warning("Snapshot %s is %s; must be available.", snapshot_name_or_id, snapshot['status'])
+ else:
+ LOG.warning("Snapshot %s not found.", snapshot_name_or_id)
+ return None
+
+ def get_external_network(self, network_name_or_id):
+ """
+ Finds the router interface whose network id equals the given network's and thereby determines the external network.
+ :param network_name_or_id:Name or id of network
+ :return:Corresponding external network
+ """
+ network_id = self.conn.get_network(network_name_or_id)["id"]
+ for router in self.conn.list_routers():
+ for interface in self.conn.list_router_interfaces(router):
+ if interface.network_id == network_id:
+ return router.external_gateway_info["network_id"]
+ return None
+
+ def attach_available_floating_ip(self, network=None, server=None):
+ """
+ Get a floating IP from a network or a pool and attach it to the server
+ :param network:
+ :param server:
+ :return:
+ """
+ floating_ip = self.conn.available_floating_ip(network=network)
+ if server:
+ self.conn.compute.add_floating_ip_to_server(server, floating_ip["floating_ip_address"])
+ return floating_ip
+
+ def get_images(self):
+ """
+ Get a generator able to generate all images
+ @return: A generator able to generate all images
+ """
+ return self.conn.compute.images()
+
+ def get_flavors(self):
+ """
+ Get a generator able to generate all flavors
+ @return: A generator able to generate all flavors
+ """
+ return self.conn.compute.flavors()
diff --git a/documentation/images/actions.jpg b/documentation/images/actions.jpg
new file mode 100644
index 000000000..d09040272
Binary files /dev/null and b/documentation/images/actions.jpg differ
diff --git a/documentation/images/features/cloud_specification_data/ac_screen1.png b/documentation/images/features/cloud_specification_data/ac_screen1.png
new file mode 100644
index 000000000..4c9ab50fa
Binary files /dev/null and b/documentation/images/features/cloud_specification_data/ac_screen1.png differ
diff --git a/documentation/images/features/cloud_specification_data/ac_screen2.png b/documentation/images/features/cloud_specification_data/ac_screen2.png
new file mode 100644
index 000000000..1f4b25613
Binary files /dev/null and b/documentation/images/features/cloud_specification_data/ac_screen2.png differ
diff --git a/documentation/images/features/cloud_specification_data/ac_screen3.png b/documentation/images/features/cloud_specification_data/ac_screen3.png
new file mode 100644
index 000000000..f42a05f26
Binary files /dev/null and b/documentation/images/features/cloud_specification_data/ac_screen3.png differ
diff --git a/documentation/images/features/cloud_specification_data/pw_screen1.png b/documentation/images/features/cloud_specification_data/pw_screen1.png
new file mode 100644
index 000000000..7fb53987f
Binary files /dev/null and b/documentation/images/features/cloud_specification_data/pw_screen1.png differ
diff --git a/documentation/images/software/theia/theia.png b/documentation/images/software/theia/theia.png
new file mode 100644
index 000000000..419f27658
Binary files /dev/null and b/documentation/images/software/theia/theia.png differ
diff --git a/documentation/markdown/bibigrid_feature_list.md b/documentation/markdown/bibigrid_feature_list.md
new file mode 100644
index 000000000..c5cae99ea
--- /dev/null
+++ b/documentation/markdown/bibigrid_feature_list.md
@@ -0,0 +1,16 @@
+# BiBiGrid Features
+
+| Name | Purpose |
+|:----------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------:|
+| [Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like |
+| [Terminate Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. |
+| [Create](features/create.md) | Creates the cluster specified by the configuration. |
+| [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. |
+| [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. |
+| [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that given cluster was setup with an ide. |
+| [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. |
+| [Cloud Specification Data](features/cloud_specification_data.md) | Contains necessary data to establish a general connection to the provider. |
+| [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. |
+| [Command Line Interface](features/CLI.md) | What command line arguments can be passed into BiBiGrid. |
+
+![](../images/actions.jpg)
\ No newline at end of file
diff --git a/documentation/markdown/bibigrid_software_list.md b/documentation/markdown/bibigrid_software_list.md
new file mode 100644
index 000000000..d3c2d4d13
--- /dev/null
+++ b/documentation/markdown/bibigrid_software_list.md
@@ -0,0 +1,8 @@
+# BiBiGrid Used Software
+
+| Name | Purpose | Official Link |
+|:------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------:|
+| [Ansible](software/ansible.md) | Ansible, an open source community project by Red Hat, enables the idempotent setup of servers. Ansible is used to **prepare** all cluster nodes. | [Getting started with Ansible](https://docs.ansible.com/ansible/latest/getting_started/index.html) |
+| [Slurm](software/slurm.md) | Slurm is an open source cluster management and job scheduling system. Slurm is used to **schedule** cluster nodes i.e. Slurm will start and shutdown nodes as needed. | [Quick Start User Guide](https://slurm.schedmd.com/quickstart.html) |
+| [Theia IDE](software/theia_ide.md) | Theia IDE is a Web IDE, built using the Theia Framework, that allows easy, intuitive and abstract **web access** to cluster nodes. Theia IDE is optional. | [Using "Theia" as an End User](https://theia-ide.org/docs/user_getting_started/) |
+| [Zabbix](software/zabbix.md) | Zabbix is an open source **monitoring** solution for networks, servers, clouds, applications and services. Zabbix is optional. | [What is Zabbix](https://www.zabbix.com/documentation/current/en/manual/introduction/about) |
\ No newline at end of file
diff --git a/documentation/markdown/features/CLI.md b/documentation/markdown/features/CLI.md
new file mode 100644
index 000000000..baca937db
--- /dev/null
+++ b/documentation/markdown/features/CLI.md
@@ -0,0 +1,17 @@
+# CLI
+Available command line parameters:
+- `-h, --help` show help message and exit
+- `-v, --verbose` Increases output verbosity (can be of great use when cluster fails to start). `-v` adds more detailed info to the logfile, `-vv` adds debug information to the logfile.
+- `-d, --debug` Keeps cluster active in case of an error. Offers termination after successful create.
+- `-i , --config_input (required)` Path to YAML configurations file. Relative paths can be used and start at `~/.config/bibigrid`
+- `-cid , --cluster_id ` Cluster id is needed for ide and termination. If no cluster id is set, the last started cluster's id will be used (except for `list_clusters`).
+## Mutually exclusive actions: choose exactly one
+- `-V, --version` Displays version.
+- `-t, --terminate_cluster` Terminates cluster. Needs cluster-id set.
+- `-c, --create` Creates cluster.
+- `-l, --list_clusters` Lists all running clusters. If cluster-id is
+ set, will list this cluster in detail only.
+- `-ch, --check` Validates cluster configuration.
+- `-ide, --ide` Establishes a secured connection to ide.
+ Needs cluster-id set.
+- `-u, --update` Updates master's playbook. Needs cluster-id set, no job running and no workers powered up.
\ No newline at end of file
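+
+## Examples
+A few illustrative calls, assuming BiBiGrid2 is started via its main module (the exact entry point may differ in your installation):
+```
+# validate the configuration
+python3 -m bibigrid2 -i bibigrid.yml -ch
+# create a cluster, with increased verbosity
+python3 -m bibigrid2 -i bibigrid.yml -c -v
+# terminate the cluster with the given id
+python3 -m bibigrid2 -i bibigrid.yml -t -cid someClusterId
+```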
diff --git a/documentation/markdown/features/check.md b/documentation/markdown/features/check.md
new file mode 100644
index 000000000..c92c8a814
--- /dev/null
+++ b/documentation/markdown/features/check.md
@@ -0,0 +1 @@
+# Check
\ No newline at end of file
diff --git a/documentation/markdown/features/cloud_specification_data.md b/documentation/markdown/features/cloud_specification_data.md
new file mode 100644
index 000000000..c70a776db
--- /dev/null
+++ b/documentation/markdown/features/cloud_specification_data.md
@@ -0,0 +1,76 @@
+# Cloud Specification Data
+To access the cloud, authentication information is required. BiBiGrid2 no longer uses environment variables but a two-file system instead.
+`clouds.yaml` and `clouds-public.yaml` can be placed in `~/.config/bibigrid/` or `/etc/bibigrid/` and will be loaded by BiBiGrid2 on execution.
+While you store your password and username in `clouds.yaml` (private), you can store all other information ready to share in `clouds-public.yaml` (shareable).
+However, all information can also be stored in `clouds.yaml` alone.
+
+Keys set in `clouds.yaml` will overwrite keys from `clouds-public.yaml`.
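+
+As a rough illustration of this precedence (a minimal sketch, not BiBiGrid2's actual merge code; nested sections such as `auth` may be merged more granularly):
+
+```python
+def effective_cloud_entry(public_entry, private_entry):
+    """Combine a clouds-public.yaml entry with its clouds.yaml entry."""
+    merged = dict(public_entry)
+    merged.update(private_entry)  # keys from clouds.yaml win
+    return merged
+```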
+
+## Openstack
+Be aware that the downloaded `clouds.yaml` file contains all information.
+OpenStack does not split information into `clouds.yaml` and `clouds-public.yaml` on its own.
+The example files show an example split.
+
+### Password Example
+Using the password `clouds.yaml` is easy. However, since passwords - unlike [Application Credentials](#application-credentials-example) -
+don't have an expiration date, caution is advised.
+
+![Download](../../images/features/cloud_specification_data/pw_screen1.png)
+
+Move the downloaded file to `~/.config/bibigrid/` or `/etc/bibigrid/`.
+
+#### Password clouds.yaml
+```yaml
+clouds:
+ openstack:
+ profile: nameOfCloudsPublicYamlEntry
+ auth:
+ username: SamSampleman
+ password: SecurePassword
+```
+
+#### Password clouds-public.yaml
+```yaml
+public-clouds:
+ nameOfCloudsPublicYamlEntry:
+ auth:
+ auth_url: https://somelink:someport
+ project_id: someProjectId
+ project_name: someProjectName
+ user_domain_name: someDomainName
+ region_name: someRegionName
+ interface: "public"
+ identity_api_version: 3
+```
+### Application Credentials Example
+The following shows how an Application Credential can be created and the related `clouds.yaml` downloaded.
+Application Credentials are the preferred way of authentication since they do have an expiration date and
+their access can be limited.
+
+![Navigation](../../images/features/cloud_specification_data/ac_screen1.png)
+![Creation](../../images/features/cloud_specification_data/ac_screen2.png)
+![Download](../../images/features/cloud_specification_data/ac_screen3.png)
+
+Move the downloaded file to `~/.config/bibigrid/` or `/etc/bibigrid/`.
+
+#### Application Credential clouds.yaml
+```yaml
+clouds:
+ openstack:
+ profile: nameOfCloudsPublicYamlEntry
+ auth:
+ application_credential_id: SomeID
+ application_credential_secret: SecureSecret
+```
+
+#### Application Credential clouds-public.yaml
+```yaml
+public-clouds:
+ nameOfCloudsPublicYamlEntry:
+ auth:
+ auth_url: https://somelink:someport
+ region_name: SomeRegion
+ interface: "public"
+ identity_api_version: 3
+ auth_type: "v3applicationcredential"
+```
\ No newline at end of file
diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md
new file mode 100644
index 000000000..3e0309323
--- /dev/null
+++ b/documentation/markdown/features/configuration.md
@@ -0,0 +1,200 @@
+# Configuration
+
+The configuration file (often called `bibigrid.yml`) contains important information about cluster creation.
+The cluster configuration holds a list of configurations where each configuration is assigned to a specific provider
+(location). That allows clusters to stretch over multiple providers. The configuration file is best stored in
+`~/.config/bibigrid/` since BiBiGrid starts its relative search there.
+
+## Configuration List
+The first configuration is always the master's provider configuration.
+Only the first configuration is allowed to have a master key.
+Every following configuration describes a provider that is not the master's provider, containing a number of workers and a
+vpnwkr (vpn worker). The vpnwkr is a worker with a floating IP. That allows the master - which knows all vpnwkrs - to access
+all workers using the floating IP as an entry point into the other local networks. However, all that is covered by
+an abstraction layer using a virtual network. Therefore, end users can work on a spread cluster without noticing it.
+
+### Master Provider Configuration
+As mentioned before, the first configuration has a master key. Apart from that, it also holds all information that is -
+simply put - true for the entire cluster. We also call those keys global.
+Keys that belong only to a single provider configuration are called local.
+For example, whether the master works alongside the workers is a general fact.
+Therefore, it is stored within the first configuration: the master provider configuration.
+
+## Keys
+
+### Global
+
+#### sshPublicKeyFiles (optional)
+`sshPublicKeyFiles` expects a list of public keyfiles to be registered on every node. That allows you to grant the owners
+of the matching private keyfiles access to created clusters. For example, you can add a colleague's public key to the list
+and allow them to access your started cluster later on to debug it.
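+
+For example (the path is a placeholder):
+```
+sshPublicKeyFiles:
+  - /home/user/.ssh/id_ecdsa.pub
+```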
+
+#### masterMounts (optional)
+`masterMounts` expects a list of volumes or snapshots that will then be mounted on the master. If any snapshots are
+given, the related volumes are first created and those volumes are then used by BiBiGrid. Those volumes are not deleted
+after cluster termination.
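+
+An illustrative entry (the name is a placeholder; volumes and snapshots are given by name or id):
+```
+masterMounts:
+  - myVolumeOrSnapshotName
+```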
+
+<details>
+<summary>What is mounting?</summary>
+
+[Mounting](https://man7.org/linux/man-pages/man8/mount.8.html) adds a new filesystem to the file tree allowing access.
+</details>
+
+#### nfsShares (optional)
+`nfsShares` expects a list of folder paths to share using NFS. `/vol/spool/` is always an nfsShare.
+This key only makes sense if the [nfs key](#nfs) is set `True`.
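+
+For example (the path is a placeholder):
+```
+nfsShares:
+  - /vol/data
+```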
+
+<details>
+<summary>What is NFS?</summary>
+
+NFS (Network File System) is a stable and well-functioning network protocol for exchanging files over the local network.
+</details>
+
+#### ansibleRoles (optional)
+Yet to be explained.
+```
+ - file: SomeFile
+ hosts: SomeHosts
+ name: SomeName
+ vars: SomeVars
+ vars_file: SomeVarsFile
+```
+#### ansibleGalaxyRoles (optional)
+Yet to be explained.
+```
+ - hosts: SomeHost
+ name: SomeName
+ galaxy: SomeGalaxy
+ git: SomeGit
+ url: SomeURL
+ vars: SomeVars
+ vars_file: SomeVarsFile
+```
+
+#### localFS (optional)
+This key helps users create a filesystem to their liking. In general, it is not needed.
+
+#### localDNSlookup (optional)
+If `True`, the master will store the names and addresses of its workers locally. This is called
+[Local DNS Lookup](https://helpdeskgeek.com/networking/edit-hosts-file/).
+
+#### zabbix (optional)
+If `True`, the monitoring solution [Zabbix](https://www.zabbix.com/) will be installed on the master.
+
+#### nfs (optional)
+If `True`, NFS shares are set up.
+
+<details>
+<summary>What is NFS?</summary>
+
+NFS (Network File System) is a stable and well-functioning network protocol for exchanging files over the local network.
+</details>
+
+#### useMasterAsCompute (optional)
+By default, the master works together with the workers on submitted jobs. If you set `useMasterAsCompute`
+to `False`, the master will no longer work on submitted jobs.
+
+#### waitForServices (optional)
+Expects a list of services to wait for. This is required if your provider runs any post-launch services. If not set,
+seemingly random errors can occur when such a service interrupts the Ansible execution. Providers and their services are
+listed in the [de.NBI Wiki](https://cloud.denbi.de/wiki/) under `Computer Center Specific`.
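+
+For example (the service name is a placeholder; use the services listed for your provider):
+```
+waitForServices:
+  - someProviderSpecific.service
+```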
+
+### Local
+
+#### infrastructure (required)
+`infrastructure` sets the provider implementation used for this configuration. Currently, only `openstack` is available.
+Other infrastructures could be AWS and so on.
+
+#### cloud
+`cloud` decides which entry in the `clouds.yaml` is used.
+When using OpenStack, the entry in the downloaded `clouds.yaml` is named `openstack`.
+
+`cloud: openstack`
+
+#### workerInstances (optional)
+`workerInstances` expects a list of workers to be used on the specific provider this configuration is for.
+`Instances` are also called `servers`.
+
+```
+workerInstances:
+ - type: de.NBI tiny
+ image: Ubuntu 22.04 LTS (2022-10-14)
+ count: 2
+```
+- `type` sets the instance's hardware configuration. Also called `flavor` sometimes.
+- `image` sets the bootable operating system to be installed on the instance.
+- `count` sets how many workers of that `type`/`image` combination are to be used by the cluster.
+
+Find your active `images`:
+
+```
+openstack image list --os-cloud=openstack | grep active
+```
+
+Find your active `flavors`:
+
+```
+openstack flavor list --os-cloud=openstack
+```
+
+#### Master or vpnWorker?
+
+##### Master
+Defined only in the first configuration, and only once:
+```
+ masterInstance:
+ type: de.NBI tiny
+ image: Ubuntu 22.04 LTS (2022-10-14)
+```
+
+##### vpnWorker
+Exactly once in every configuration but the first:
+```
+ vpnWorker:
+ type: de.NBI tiny
+ image: Ubuntu 22.04 LTS (2022-10-14)
+```
+
+#### sshUser (required)
+`sshUser` is the default user of the installed image. For `Ubuntu 22.04` this is `ubuntu`.
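+
+For example:
+
+`sshUser: ubuntu`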
+
+#### region (required)
+Every [region](https://docs.openstack.org/python-openstackclient/rocky/cli/command-objects/region.html) has its own
+OpenStack deployment. Every [availability zone](#availabilityzone-required) belongs to a region.
+
+Find your `regions`:
+```
+openstack region list --os-cloud=openstack
+```
+
+
+#### availabilityZone (required)
+[Availability zones](https://docs.openstack.org/nova/latest/admin/availability-zones.html) allow you to logically group
+nodes.
+
+Find your `availabilityZones`:
+```
+openstack availability zone list --os-cloud=openstack
+```
+
+#### subnet (required)
+`subnet` is a block of IP addresses.
+
+Find available `subnets`:
+
+```
+openstack subnet list --os-cloud=openstack
+```
+
+#### localDNSLookup (optional)
+If no full DNS service for started instances is available, set `localDNSLookup: True`.
+This is currently the case in Berlin, DKFZ, Heidelberg and Tuebingen.
\ No newline at end of file
diff --git a/documentation/markdown/features/create.md b/documentation/markdown/features/create.md
new file mode 100644
index 000000000..6efe52f02
--- /dev/null
+++ b/documentation/markdown/features/create.md
@@ -0,0 +1,2 @@
+# Create
+Temporary cluster keys will be stored in `~/.config/bibigrid/keys`.
\ No newline at end of file
diff --git a/documentation/markdown/features/ide.md b/documentation/markdown/features/ide.md
new file mode 100644
index 000000000..6093e7468
--- /dev/null
+++ b/documentation/markdown/features/ide.md
@@ -0,0 +1,2 @@
+# Web IDE
+
diff --git a/documentation/markdown/features/list_clusters.md b/documentation/markdown/features/list_clusters.md
new file mode 100644
index 000000000..0f8321173
--- /dev/null
+++ b/documentation/markdown/features/list_clusters.md
@@ -0,0 +1 @@
+# List Clusters
\ No newline at end of file
diff --git a/documentation/markdown/features/terminate_cluster.md b/documentation/markdown/features/terminate_cluster.md
new file mode 100644
index 000000000..a47eb2894
--- /dev/null
+++ b/documentation/markdown/features/terminate_cluster.md
@@ -0,0 +1 @@
+# Terminate Cluster
\ No newline at end of file
diff --git a/documentation/markdown/features/update.md b/documentation/markdown/features/update.md
new file mode 100644
index 000000000..3e9ff9ecf
--- /dev/null
+++ b/documentation/markdown/features/update.md
@@ -0,0 +1 @@
+# Update
\ No newline at end of file
diff --git a/documentation/markdown/features/version.md b/documentation/markdown/features/version.md
new file mode 100644
index 000000000..e04a043bb
--- /dev/null
+++ b/documentation/markdown/features/version.md
@@ -0,0 +1 @@
+# Version
\ No newline at end of file
diff --git a/documentation/markdown/software/ansible.md b/documentation/markdown/software/ansible.md
new file mode 100644
index 000000000..f7e02ac8a
--- /dev/null
+++ b/documentation/markdown/software/ansible.md
@@ -0,0 +1,39 @@
+# Ansible
+
+## Ansible Tutorial
+- [Ansible Workshop Presentation](https://docs.google.com/presentation/d/1W4jVHLT8dB1VsdtxXqtKlMqGbeyEWTQvSHh0WMfWo2c/edit#slide=id.p10)
+- [de.NBI Cloud's Ansible Course](https://gitlab.ub.uni-bielefeld.de/denbi/ansible-course)
+
+## Executing BiBiGrid's Playbook Manually
+Only execute BiBiGrid's playbook manually when no worker is up. The playbook is executed automatically for workers powering up.
+
+If you've implemented changes to BiBiGrid's playbook, you might want to execute it manually to see how
+those changes play out. For this you can use the preinstalled `ansible-playbook` command. However, BiBiGrid provides a
+handy shortcut for it called `bibiplay`.
+
+### bibiplay
+To make things easier we wrote the [bibiplay](..%2F..%2F..%2Fresources%2Fbin%2Fbibiplay) wrapper. It's used like this:
+```sh
+bibiplay
+```
+is the same as:
+```sh
+ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts
+```
+Any additional arguments are passed to `ansible-playbook`:
+```sh
+bibiplay -l master
+```
+is the same as:
+```sh
+ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts -l master
+```
+
+### Useful commands
+For more options see [ansible-playbook's manpage](https://linux.die.net/man/1/ansible-playbook).
+
+
+| Summary | Command |
+|:----------------------------------------------------------------:|:-----------------------------:|
+| Prepare the master manually | `bibiplay -l master` |
+| Prepare only Slurm on the master manually | `bibiplay -l master -t slurm` |
diff --git a/documentation/markdown/software/slurm.md b/documentation/markdown/software/slurm.md
new file mode 100644
index 000000000..a97a2b3f6
--- /dev/null
+++ b/documentation/markdown/software/slurm.md
@@ -0,0 +1,21 @@
+# Slurm
+Be aware that due to BiBiGrid's Slurm configuration the default behavior of commands might differ slightly from Slurm's defaults.
+Everything described below explains how Slurm behaves in BiBiGrid's context.
+
+## Slurm Client
+### Useful commands
+For more options see [slurm client's manpage](https://manpages.debian.org/testing/slurm-client/slurm-wlm.1).
+
+| Summary | Command | Explanation & Comment |
+|:----------------------:|:----------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
+| List all present nodes | `sinfo` | Cloud nodes that are powered down are marked `~`. Knowing [Node State Codes](https://manpages.debian.org/testing/slurm-client/sinfo.1.en.html#NODE_STATE_CODES) helps a lot. |
+| Shut down an instance | `sudo scontrol update NodeName=[node-name] state=POWER_DOWN reason=[reason]` | Powers down the node. The instance will be deleted. |
+| Power up an instance | `sudo scontrol update NodeName=[node-name] state=POWER_UP reason=[reason]` | Powers up the node. An instance will be created. |
+| List all running jobs | `squeue` | Allows you to see whether everything runs as expected. |
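+
+To try scheduling end to end, you can, for example, submit a tiny batch job (a minimal sketch; the job name and output file are arbitrary placeholders):
+```sh
+# write a minimal job script (placeholder name hello.sh)
+cat > hello.sh <<'EOF'
+#!/bin/bash
+#SBATCH --job-name=hello
+#SBATCH --output=hello.%j.out
+srun hostname
+EOF
+# submit it and watch it in the queue
+sbatch hello.sh
+squeue
+```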
+
+### Read more
+
+| Summary | Explanation |
+|:---------------------------------------------------------------------------------:|:--------------------------------------------:|
+| [NODE STATE CODES](https://slurm.schedmd.com/sinfo.html#SECTION_NODE-STATE-CODES) | Very helpful to interpret `sinfo` correctly. |
+
diff --git a/documentation/markdown/software/theia_ide.md b/documentation/markdown/software/theia_ide.md
new file mode 100644
index 000000000..92e9123b6
--- /dev/null
+++ b/documentation/markdown/software/theia_ide.md
@@ -0,0 +1,6 @@
+# Theia IDE
+[Theia Web IDE's](https://www.theia-ide.org/) many features make it easier to work on your cloud instances.
+
+![Theia](../../images/software/theia/theia.png)
+## Installing Python Syntax Highlighter
+
diff --git a/documentation/markdown/software/zabbix.md b/documentation/markdown/software/zabbix.md
new file mode 100644
index 000000000..23df10393
--- /dev/null
+++ b/documentation/markdown/software/zabbix.md
@@ -0,0 +1,3 @@
+# Zabbix
+
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..2cbce46da
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,58 @@
+appdirs==1.4.4
+attrs==22.1.0
+autopage==0.5.1
+bcrypt==4.0.1
+certifi==2022.9.24
+cffi==1.15.1
+charset-normalizer==2.1.1
+cliff==4.1.0
+cmd2==2.4.2
+cryptography==38.0.4
+debtcollector==2.5.0
+decorator==5.1.1
+dogpile.cache==1.1.8
+idna==3.4
+importlib-metadata==5.1.0
+iso8601==1.1.0
+jmespath==1.0.1
+jsonpatch==1.32
+jsonpointer==2.3
+keystoneauth1==5.1.0
+mergedeep==1.3.4
+msgpack==1.0.4
+munch==2.5.0
+netaddr==0.8.0
+netifaces==0.11.0
+openstacksdk==0.103.0
+os-service-types==1.7.0
+osc-lib==2.6.2
+oslo.config==9.0.0
+oslo.i18n==5.1.0
+oslo.serialization==5.0.0
+oslo.utils==6.1.0
+packaging==21.3
+paramiko==2.12.0
+pbr==5.11.0
+prettytable==3.5.0
+pycparser==2.21
+PyNaCl==1.5.0
+pyparsing==3.0.9
+pyperclip==1.8.2
+python-cinderclient==9.1.0
+python-keystoneclient==5.0.1
+python-novaclient==18.2.0
+python-openstackclient==6.0.0
+pytz==2022.6
+PyYAML==6.0
+requests==2.28.1
+requestsexceptions==1.4.0
+rfc3986==2.0.0
+shortuuid==1.0.11
+simplejson==3.18.0
+six==1.16.0
+sshtunnel==0.4.0
+stevedore==4.1.1
+urllib3==1.26.13
+wcwidth==0.2.5
+wrapt==1.14.1
+zipp==3.11.0
diff --git a/resources/bin/bibigrid-hello-world.sh b/resources/bin/bibigrid-hello-world.sh
new file mode 100755
index 000000000..4cd7c728e
--- /dev/null
+++ b/resources/bin/bibigrid-hello-world.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+exe() { echo "\$" "$@" ; "$@" ; }
+
+echo "Hello, World! This program will show very basic slurm scheduling."
+echo "I) Only execute this just after logging in and without any prior changes"
+echo "II) You need to have at least one worker in your configuration or this program will hang at some point."
+echo "III) The master should be configured to work as well or this program will hang at some point."
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo "Let's see which servers are up using sinfo (slurm info)!"
+exe sinfo
+echo -e "\nOnly the master is up, since all other workers are configured, but not powered up ('~' is used for nodes that are powered down)."
+echo "See here for more info about node states: https://slurm.schedmd.com/sinfo.html#SECTION_NODE-STATE-CODES"
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo -e "\nLet's execute the 'hostname' command:"
+exe srun hostname
+echo -e "\nAnd see if a server started"
+exe sinfo
+echo -e "\nSince the master is a worker, too, no need to start new workers."
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo -e "\nWhat if we need another server? Let's exclude $(hostname) for now using (-x node-name-to-exclude), so slurm has to power up a worker node."
+echo "While it starts, open another terminal and execute 'squeue'. That will show you the running job."
+echo "Also execute 'sinfo' that will show you the node is powering up ('#' is used for nodes that are powering up). But now let's start another node:"
+start_time=$(date +%T)
+exe srun -x "$(hostname)" hostname
+echo "We triggered the power up at: $(date +%T). Now it's $start_time."
+echo -e "\nLet's see what changed."
+exe sinfo
+echo "Now a worker powered up as we can see looking at 'sinfo'"
+read -n 1 -r -s -p $'Press enter to continue...\n'
+echo -e "\nWorkers that are not used will be shut down after a while."
diff --git a/resources/bin/bibiplay b/resources/bin/bibiplay
new file mode 100644
index 000000000..6b4d6148f
--- /dev/null
+++ b/resources/bin/bibiplay
@@ -0,0 +1,3 @@
+#!/bin/bash
+# allows for an easier execution of the ansible playbook no matter where you are
+ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts "$@"
\ No newline at end of file
diff --git a/resources/playbook/ansible.cfg b/resources/playbook/ansible.cfg
new file mode 100644
index 000000000..ee5363109
--- /dev/null
+++ b/resources/playbook/ansible.cfg
@@ -0,0 +1,10 @@
+# This file is moved programmatically to /etc/ansible/ansible.cfg on the master so it shouldn't be moved manually
+[defaults]
+inventory = ./ansible_hosts
+host_key_checking = False
+forks = 50
+pipelining = True
+log_path = ~/ansible.log
+timeout = 60
+[ssh_connection]
+ssh_args = -o ControlMaster=auto -o ControlPersist=60s
\ No newline at end of file
diff --git a/resources/playbook/roles/additional/example/meta/main.yml b/resources/playbook/roles/additional/example/meta/main.yml
new file mode 100644
index 000000000..8ff216df2
--- /dev/null
+++ b/resources/playbook/roles/additional/example/meta/main.yml
@@ -0,0 +1,28 @@
+galaxy_info:
+ role_name: Hello-World Example
+ author: Tim Dilger
+ description: Shows a working example of installing an Ansible role.
+ company: Bielefeld University, CeBiTec, BiBiServ
+
+ license: BSD
+
+ min_ansible_version: 2.7
+
+ platforms:
+ - name: EL
+ versions:
+ - 7
+ - name: Debian
+ versions:
+ - stretch
+ - name: Ubuntu
+ versions:
+ - xenial
+ - bionic
+
+ galaxy_tags:
+ - hello-world
+
+dependencies: []
+ # List your role dependencies here, one per line. Be sure to remove the '[]' above,
+ # if you add dependencies to this list.
diff --git a/resources/playbook/roles/additional/example/tasks/main.yml b/resources/playbook/roles/additional/example/tasks/main.yml
new file mode 100644
index 000000000..63ea8e434
--- /dev/null
+++ b/resources/playbook/roles/additional/example/tasks/main.yml
@@ -0,0 +1,3 @@
+- debug:
+ msg:
+ - "Hello {{ ansible_user }}!"
diff --git a/resources/playbook/roles/bibigrid/defaults/main.yml b/resources/playbook/roles/bibigrid/defaults/main.yml
new file mode 100644
index 000000000..2702e6fbb
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/defaults/main.yml
@@ -0,0 +1,6 @@
+nvm_install_dir: /opt/nvm
+
+theia_version: "next"
+theia_ide_install_dir: /opt/theia-ide
+theia_ide_bind_address: localhost
+theia_ide_bind_port: 8181
diff --git a/resources/playbook/roles/bibigrid/files/disable-auto-upgrades.conf b/resources/playbook/roles/bibigrid/files/disable-auto-upgrades.conf
new file mode 100644
index 000000000..8717231ce
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/disable-auto-upgrades.conf
@@ -0,0 +1,4 @@
+APT::Periodic::Update-Package-Lists "0";
+APT::Periodic::Download-Upgradeable-Packages "0";
+APT::Periodic::AutocleanInterval "0";
+APT::Periodic::Unattended-Upgrade "0";
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf b/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf
new file mode 100644
index 000000000..2b8ba9c6a
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf
@@ -0,0 +1,14 @@
+CgroupMountpoint="/sys/fs/cgroup"
+CgroupAutomount=yes
+CgroupReleaseAgentDir="/etc/slurm/cgroup"
+AllowedDevicesFile="/etc/slurm/cgroup_allowed_devices_file.conf"
+ConstrainCores=no
+TaskAffinity=no
+ConstrainRAMSpace=yes
+ConstrainSwapSpace=no
+ConstrainDevices=no
+AllowedRamSpace=100
+AllowedSwapSpace=0
+MaxRAMPercent=100
+MaxSwapPercent=100
+MinRAMSpace=30
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/cgroup_allowed_devices_file.conf b/resources/playbook/roles/bibigrid/files/slurm/cgroup_allowed_devices_file.conf
new file mode 100644
index 000000000..471ad8cfd
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/cgroup_allowed_devices_file.conf
@@ -0,0 +1,7 @@
+/dev/null
+/dev/urandom
+/dev/zero
+/dev/sd*
+/dev/vd*
+/dev/cpu/*/*
+/dev/pts/*
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/create.sh b/resources/playbook/roles/bibigrid/files/slurm/create.sh
new file mode 100644
index 000000000..98df3be7f
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/create.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# redirect stderr and stdout
+exec >> /var/log/slurm/create.out.log
+exec 2>> /var/log/slurm/create.err.log
+
+
+hosts=$(scontrol show hostnames "$1")
+
+
+# create and configure requested instances
+python3 /usr/local/bin/create_server.py "${hosts}"
+exit $?
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py
new file mode 100644
index 000000000..53e79005a
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""
+Creates one or more instances from a newline-separated name list.
+Is called automatically by create.sh (run by the slurm user), which sources a virtual environment.
+"""
+import logging
+import math
+import re
+import sys
+import time
+
+import ansible_runner
+import os_client_config
+import paramiko
+import yaml
+from openstack.exceptions import OpenStackCloudException
+
+LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
+logging.basicConfig(format=LOGGER_FORMAT, filename="/var/log/slurm/create_server.log", level=logging.INFO)
+
+logging.info("create_server.py started")
+start_time = time.time()
+
+
+def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", username="ubuntu", timeout=5):
+ """
+    Waits until an SSH connection can be established. This guarantees that the node can be reached via Ansible.
+    @param private_ip: ip of the node
+    @param private_key: path to the private ssh key
+    @param username: username on the node
+    @param timeout: maximum number of connection attempts (with exponential backoff between attempts)
+ @return:
+ """
+ # Wait for SSH Connection available
+ paramiko_key = paramiko.ECDSAKey.from_private_key_file(private_key)
+ with paramiko.SSHClient() as client:
+ client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ attempts = 0
+ establishing_connection = True
+ while establishing_connection:
+ try:
+ client.connect(hostname=private_ip, username=username, pkey=paramiko_key)
+ establishing_connection = False
+ except paramiko.ssh_exception.NoValidConnectionsError as exc:
+ logging.info("Attempting to connect to %s... This might take a while", private_ip)
+ if attempts < timeout:
+ time.sleep(2 ** attempts)
+ attempts += 1
+ else:
+ logging.warning("Attempt to connect to %s failed.", private_ip)
+ raise ConnectionError from exc
+
+
+def run_playbook(run_instances):
+ """
+ Runs the BiBiGrid playbook for run_instances
+ @param run_instances: instances to run the playbook for
+ @return:
+ """
+ logging.info("run_playbook with \ninstances: %s", run_instances)
+
+ # cmdline_args = ["/opt/playbook/site.yml", '-i', '/opt/playbook/ansible_hosts', '-vvvv', '-l', instances]
+ cmdline_args = ["/opt/playbook/site.yml", '-i', '/opt/playbook/ansible_hosts', '-l', ",".join(instances)]
+ executable_cmd = '/usr/local/bin/ansible-playbook'
+ logging.info(f"run_command...\nexecutable_cmd: {executable_cmd}\ncmdline_args: {cmdline_args}")
+
+ runner = ansible_runner.interface.init_command_config(
+ executable_cmd=executable_cmd,
+ cmdline_args=cmdline_args)
+
+ runner.run()
+ runner_response = runner.stdout.read()
+ runner_error = runner.stderr.read()
+ return runner, runner_response, runner_error, runner.rc
+
+
+if len(sys.argv) < 2:
+ logging.warning("usage: $0 instance1_name[,instance2_name,...]")
+ logging.info("Your input % with length %s", sys.argv, len(sys.argv))
+ sys.exit(1)
+
+sdk = os_client_config.make_sdk(cloud="master")
+
+# read instances configuration
+with open("/opt/playbook/vars/instances.yml", mode="r") as f:
+ worker_types = yaml.safe_load(f)
+
+# read common configuration
+with open("/opt/playbook/vars/common_configuration.yml", mode="r") as f:
+ common_config = yaml.safe_load(f)
+
+instances = sys.argv[1].split("\n")
+logging.info("Instances: %s", instances)
+
+server_list = []
+openstack_exception_list = []
+# Iterate over all names and search for a fitting ...
+for worker in instances:
+ # ... worker_type
+ for worker_type in worker_types["workers"]:
+ if re.match(worker_type["regexp"], worker):
+ try:
+ logging.info("Create server %s.", worker)
+ # create server and ...
+ server = sdk.create_server(
+ name=worker,
+ flavor=worker_type["flavor"]["name"],
+ image=worker_type["image"],
+ network=worker_type["network"],
+ key_name=f"tempKey_bibi-{common_config['cluster_id']}",
+ wait=False)
+ # ... add it to server
+ server_list.append(server)
+ # ToDo Better handling, Check edge cases, ...
+ except OpenStackCloudException as exc:
+ logging.warning("While creating %s the OpenStackCloudException %s occurred. Worker ignored.",
+ worker, exc)
+ openstack_exception_list.append(worker)
+
+# ToDo implement better error handling
+no_ssh_list = []
+return_list = []
+openstack_wait_exception_list = []
+for server in server_list:
+ try:
+ sdk.wait_for_server(server, auto_ip=False, timeout=600)
+ server = sdk.get_server(server["id"])
+ except OpenStackCloudException as exc:
+ logging.warning("While creating %s the OpenStackCloudException %s occurred.", worker, exc)
+ openstack_wait_exception_list.append(server.name)
+ continue
+ logging.info("%s is active. Checking ssh", server.name)
+ try:
+ check_ssh_active(server.private_v4)
+ logging.info(f"Server {server.name} is {server.status}.")
+ return_list.append(server.name)
+ except ConnectionError as exc:
+ logging.warning(f"{exc}: Couldn't connect to {server.name}.")
+ no_ssh_list.append(server.name)
+
+# If no suitable server can be started: abort
+if len(return_list) == 0:
+ logging.warning("No suitable server found! Abort!")
+    sys.exit(1)
+
+logging.info("Call Ansible to configure instances.")
+# run ansible
+# ToDo: use https://ansible-runner.readthedocs.io/en/latest/ instead of subprocess
+runnable_instances = ",".join(return_list)
+
+r, response, error, rc = run_playbook(runnable_instances)
+logging.info("Ansible executed!")
+unreachable_list = list(r.stats["dark"].keys())
+failed_list = list(r.stats["failures"].keys())
+overall_failed_list = unreachable_list + failed_list + no_ssh_list + openstack_wait_exception_list
+if overall_failed_list or openstack_exception_list:
+ logging.warning(f"Openstack exception list: {openstack_exception_list}")
+ logging.warning(f"Unable to connect via ssh list: {no_ssh_list}")
+ logging.warning(f"Unreachable list: {unreachable_list}")
+ logging.warning(f"Failed list: {failed_list}")
+ logging.warning(f"Return code: {rc}")
+ for server_name in overall_failed_list:
+ logging.warning(f"Deleting server {server_name}: {sdk.delete_server(server_name)}")
+ logging.warning("Exit Code 1")
+    sys.exit(1)
+logging.info("Successful create_server.py execution!")
+time_in_s = time.time() - start_time
+logging.info(f"--- %s minutes and %s seconds ---", math.floor(time_in_s / 60), time_in_s % 60)
+logging.info("Exit Code 0")
+exit(0)
diff --git a/resources/playbook/roles/bibigrid/files/slurm/fail.sh b/resources/playbook/roles/bibigrid/files/slurm/fail.sh
new file mode 100644
index 000000000..b6fa68398
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/fail.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# redirect stderr and stdout
+exec >> /var/log/slurm/fail.out.log
+exec 2>> /var/log/slurm/fail.err.log
+
+# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
+scontrol update NodeName="$1" state=RESUME reason=FailedStartup # no sudo needed because this is executed by the slurm user
+
+exit $?
diff --git a/resources/playbook/roles/bibigrid/files/slurm/requirements.txt b/resources/playbook/roles/bibigrid/files/slurm/requirements.txt
new file mode 100644
index 000000000..a0a23f56f
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/requirements.txt
@@ -0,0 +1,5 @@
+python-openstackclient==6.0.0
+ipython
+os_client_config
+paramiko
+ansible-runner
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/slurmrestd_default b/resources/playbook/roles/bibigrid/files/slurm/slurmrestd_default
new file mode 100644
index 000000000..b6d2fd860
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/slurmrestd_default
@@ -0,0 +1,9 @@
+# /etc/default/slurmrestd
+# Additional options that are passed to the slurmrestd daemon
+#SLURMRESTD_OPTIONS=""
+SLURM_CONF="/etc/slurm/slurm.conf"
+#SLURMRESTD_DEBUG="8"
+SLURM_JWT=""
+SLURMRESTD_LISTEN=":6820"
+SLURMRESTD_AUTH_TYPES="rest_auth/jwt"
+SLURMRESTD_OPENAPI_PLUGINS="openapi/v0.0.36"
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/slurmrestd_override.conf b/resources/playbook/roles/bibigrid/files/slurm/slurmrestd_override.conf
new file mode 100644
index 000000000..eebbe66f7
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/slurmrestd_override.conf
@@ -0,0 +1,6 @@
+# Override systemd service ExecStart command to disable unixSocket of slurmrestd
+[Unit]
+After=slurmdbd.service
+[Service]
+ExecStart=
+ExecStart=/usr/sbin/slurmrestd $SLURMRESTD_OPTIONS
\ No newline at end of file
diff --git a/resources/playbook/roles/bibigrid/files/slurm/terminate.sh b/resources/playbook/roles/bibigrid/files/slurm/terminate.sh
new file mode 100644
index 000000000..ba6e6232a
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/slurm/terminate.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# redirect stderr and stdout
+exec >> /var/log/slurm/terminate.out.log
+exec 2>> /var/log/slurm/terminate.err.log
+
+function log {
+ echo "$(date) $*"
+}
+
+log "Terminate invoked $0 $*"
+# extract all hosts from the argument list
+hosts=$(scontrol show hostnames "$1")
+for host in $hosts
+do
+ # ToDo: Implement better logging in case of an error
+ log "Delete instance ${host} from Zabbix host list."
+ python3 /usr/local/bin/zabbix_host_delete.py --pwd bibigrid "${host}"
+ log "Terminate instance ${host}"
+ openstack --os-cloud master server delete "${host}"
+ log "done"
+done
diff --git a/resources/playbook/roles/bibigrid/files/zabbix/index.html b/resources/playbook/roles/bibigrid/files/zabbix/index.html
new file mode 100644
index 000000000..076bd06a9
--- /dev/null
+++ b/resources/playbook/roles/bibigrid/files/zabbix/index.html
@@ -0,0 +1,40 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>BiBiGrid Overview</title>
+</head>
+<body>