refactor: Sorry (#143)
eddiebergman authored Nov 13, 2023
1 parent d134a45 commit b4a6c48
Showing 179 changed files with 9,642 additions and 13,380 deletions.
13 changes: 11 additions & 2 deletions .github/workflows/docs.yml
@@ -3,6 +3,16 @@ on:
# Disabled for now essentially
workflow_dispatch:

#push:
# branches:
# - main
#tags:
# - "*.*.*"

#pull_request:
#branches:
# - main

permissions:
contents: write

@@ -18,5 +28,4 @@ jobs:
with:
key: ${{ github.ref }}
path: .cache
- run: pip install "mkdocs-material" "mkdocs-autorefs" "mkdocstrings[python]"
- run: mkdocs gh-deploy --force
- run: python -m pip install -e ".[dev]"
4 changes: 2 additions & 2 deletions .github/workflows/pre-commit.yml
@@ -14,10 +14,10 @@ jobs:
with:
submodules: recursive

- name: Setup Python 3.8
- name: Setup Python 3.10
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.10

- name: Install pre-commit
run: |
22 changes: 13 additions & 9 deletions .github/workflows/test.yml
@@ -1,20 +1,24 @@
name: Tests

on:
# Disabled for now essentially
workflow_dispatch:

push:
branches:
- main
tags:
- "*.*.*"

pull_request:
branches:
- main

env:

package-name: byop
package-name: amltk
test-dir: tests
extra-requires: "[dev]" # "" for no extra_requires

# Arguments used for pytest
pytest-args: >-
-v
--log-level=DEBUG
jobs:
test:

@@ -28,7 +32,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11", "3.12"]
os: ["ubuntu-latest", "macos-latest", "windows-latest"]

steps:
@@ -48,4 +52,4 @@ jobs:
- name: Tests
run: |
pytest ${{ env.pytest-args }} ${{ env.test-dir }}
pytest ${{ env.test-dir }}
19 changes: 7 additions & 12 deletions .pre-commit-config.yaml
@@ -7,7 +7,7 @@ files: |
)/.*\.py$
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-added-large-files
files: ".*"
@@ -26,37 +26,32 @@ repos:
- id: debug-statements
files: '^src/.*\.py$'
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
rev: v1.7.0
hooks:
- id: mypy
exclude: "test_comm_task" # Pre-commit mypy hates this one, crashes on (l106)
additional_dependencies:
- "attrs"
- "types-pyyaml"
- "types-psutil"
args:
- "--no-warn-return-any" # Disable this because it doesn't know about 3rd party imports
- "--ignore-missing-imports"
- "--show-traceback"
- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
args: ["--config=pyproject.toml"]
- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.23.3
rev: 0.27.1
hooks:
- id: check-github-workflows
files: '^\.github/workflows/.*\.ya?ml$'
types: ["yaml"]
- id: check-dependabot
files: '^\.github/dependabot\.ya?ml$'
- repo: https://github.com/commitizen-tools/commitizen
rev: 3.5.3
rev: 3.12.0
hooks:
- id: commitizen
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.278
rev: v0.1.5
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --no-cache]
- id: ruff-format
3 changes: 3 additions & 0 deletions docs/api_generator.py
@@ -23,6 +23,9 @@
if parts[-1] in ("__main__", "__version__", "__init__"):
continue

if any(part.startswith("_") for part in parts):
continue

nav[parts] = doc_path.as_posix()

with mkdocs_gen_files.open(full_doc_path, "w") as fd:
10 changes: 5 additions & 5 deletions docs/example_runner.py
@@ -8,14 +8,15 @@
from itertools import takewhile
from pathlib import Path
from typing import Any
from typing_extensions import override

import mkdocs_gen_files
from more_itertools import first_true, peekable

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

nav = mkdocs_gen_files.Nav() # pyright: reportPrivateImportUsage=false
nav = mkdocs_gen_files.Nav() # pyright: ignore[reportPrivateImportUsage]

ENV_VAR = "AMLTK_DOC_RENDER_EXAMPLES"

@@ -119,6 +120,7 @@ def code(self, code: list[str]) -> str:
body = "\n".join(s)
return body

@override
def __str__(self) -> str:
return self.code(self.lines)

@@ -127,6 +129,7 @@ def __str__(self) -> str:
class CommentSegment:
lines: list[str]

@override
def __str__(self) -> str:
return "\n".join(self.lines)

@@ -289,8 +292,5 @@ def copy_section(self) -> str:
mkdocs_gen_files.set_edit_path(full_doc_path, path)

lines = list(nav.build_literate_nav())
with mkdocs_gen_files.open("examples/SUMMARY.md", "w") as nav_file: #
with mkdocs_gen_files.open("examples/index.md", "w") as nav_file: #
nav_file.writelines(lines) #

with mkdocs_gen_files.open("examples/index.md", "w") as index_file:
index_file.writelines(lines) #
110 changes: 71 additions & 39 deletions docs/guides/index.md
@@ -3,76 +3,108 @@ of AutoML-Toolkit. Notably, we have three core concepts at the heart of
AutoML-Toolkit, with supporting types and auxiliary functionality to
enable these concepts.

These take the form of a [`Task`][amltk.scheduling.Task], a [`Pipeline`][amltk.pipeline.Pipeline]
and an [`Optimizer`][amltk.optimization.Optimizer] which combines the two
to create the most flexible optimization framework we could imagine.
These take the form of **scheduling**, **pipeline construction**
and **optimization**. By combining these concepts, we provide an extensive
framework from which to do AutoML research, utilize AutoML for your
task or build brand-new AutoML systems.

---

- **Task**
- **Scheduling**

A `Task` is a function which we want to run _somewhere_, whether it be a local
process, on some node of a cluster or out in the cloud. Equipped with an
[`asyncio`][asyncio] **event-system** and a [`Scheduler`][amltk.scheduling.Scheduler]
to drive the gears of the system, we can provide a truly flexible and performant framework
upon which to build an AutoML system.
Dealing with multiple processes and simultaneous compute can be difficult,
both in terms of understanding and of utilization.
Often a prototype script just doesn't work when you need to run
larger experiments.

We provide an **event-driven system** with a flexible **backend**
to help you write code that scales from just a few more cores on your machine
to utilizing an entire cluster.

This guide introduces `Task`s and the `Scheduler` in which they run, as well
as the `@events` to which you can subscribe callbacks. Define what should run and
when it should run, then simply define a callback to say what should happen when it's done.

This framework allows you to write code that simply scales, with as little
code change required as possible. Go from a single local process to an entire
cluster with the same script and 5 lines of code.

Check out the full [Scheduling guide](./scheduling.md); we also cover some of these
topics in brief in the reference pages. A minimal sketch follows the feature list below.

!!! tip "Notable Features"

* A system that allows incremental and encapsulated feature addition.
* An event-driven system with easy to use _callbacks_.
* Place constraints on your `Task`.
* Integrations for different backends for where to run your tasks.
* An [`@event`](site:reference/scheduling/events.md) system with easy-to-use _callbacks_.
* Place constraints and modify your [`Task`](site:reference/scheduling/task.md)
with [`Plugins`](site:reference/scheduling/plugins.md).
* Integrations for different [backends](site:reference/scheduling/executors.md) for where
to run your tasks.
* A wide set of events to plug into.
* An easy to extend system to create your own specialized events and tasks.

Checkout the [Scheduling guide](./scheduling.md)
* An easy way to extend the provided functionality with your own set of domain- or
task-specific events.
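
To make this concrete, here is a minimal sketch of a function wrapped as a `Task` and run in a `Scheduler`, with callbacks subscribed to its events. It follows the style of the scheduling guide, but treat the exact names (`Scheduler.with_processes()`, `scheduler.task()`, `@task.on_result`) as illustrative rather than authoritative; `compute()` is a stand-in workload of our own.

```python
from amltk.scheduling import Scheduler


def compute(x: int) -> int:
    """A stand-in for whatever expensive work a Task should run."""
    return x * 2


scheduler = Scheduler.with_processes(2)  # backend: a local process pool
task = scheduler.task(compute)           # wrap the function as a Task


@scheduler.on_start
def launch() -> None:
    # When the scheduler starts, submit the first piece of work.
    task.submit(5)


@task.on_result
def print_result(future, result: int) -> None:
    # Called back once the task finishes; follow-up work could be submitted here.
    print(f"compute returned {result}")


scheduler.run()
```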

---

- **Pipeline**
- **Pipelines**

Optimizers require a _search space_ to optimize, yet provide no utility to actually
define these search spaces. When scaling beyond a simple single model, these search spaces
become harder to define, difficult to extend and are often disjoint from the actual pipeline
creation. When you want to create search spaces that can have choices between models, parametrized
pre-processing and a method to quickly change these setups, it can often feel tedious
and error-prone.

By piecing together the `Node`s of a pipeline, utilizing a set of different building blocks such
as `Component`, `Sequential`, `Choice` and more, you can abstractly define your entire pipeline.
Once you're done, we'll stitch together the entire `search_space()`, allow you to
easily `configure()` it and finally `build()` it into a concrete object you can use,
all in the same place.

A [`Pipeline`][amltk.pipeline.Pipeline] is a definition of what your **pipeline** will do and how
it can be parametrized. By piecing together [`steps`][amltk.pipeline.api.step],
[`choices`][amltk.pipeline.api.choice] and [`splits`][amltk.pipeline.api.split], you can
say how your pipeline should look and how it's parametrized. We'll take care
of creating the search space to optimize over, configuring it and finally assembling
it into something you can actually use.
Check out the full [Pipeline guide](./pipelines.md); we also cover some of these
topics in brief in the reference pages. A short sketch of this flow follows the feature list below.

!!! tip "Notable Features"

* An easy to edit pipeline structure, allowing for rapid addition, deletion and
* An easy, declarative pipeline structure, allowing for rapid addition, deletion and
modification during experimentation.
* A flexible pipeline capable of handling complex structures and subpipelines.
* Easily attachable modules for things close to your pipeline but not a direct
part of the main structure.
* Multiple component types to help you define your pipeline.
* Exporting of pipelines into concrete implementations like an [sklearn.pipeline.Pipeline][]
for use in your downstream tasks.
* Extensible, so you can add your own component types and `builder=`s to use.

Check out the [Pipeline guide](./pipelines.md)
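
As a hedged sketch of the flow above: declare the pipeline from nodes, stitch together its `search_space()`, `configure()` it with a sampled configuration and `build()` a concrete object. The node types are the ones named in this guide; the parser and builder names (`"configspace"`, `"sklearn"`) and the example spaces are assumptions for illustration.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from amltk.pipeline import Choice, Component, Sequential

# A scaler followed by a choice between two parametrized estimators.
pipeline = (
    Sequential(name="my_pipeline")
    >> Component(StandardScaler)
    >> Choice(
        Component(RandomForestClassifier, space={"n_estimators": (10, 100)}),
        Component(KNeighborsClassifier, space={"n_neighbors": (1, 10)}),
        name="estimator",
    )
)

space = pipeline.search_space("configspace")  # assumed parser name
config = space.sample_configuration()         # ConfigSpace sampling
model = pipeline.configure(config).build("sklearn")  # assumed builder name
```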

---

- **Optimizer**
- **Optimization**

An [`Optimizer`][amltk.optimization.Optimizer] is the capstone of the preceding two
fundamental systems. By leveraging an _"ask-and-tell"_ interface, we put you back
in control of how your system interacts with the optimizer. You run what you want,
wherever you want, telling the optimizer what you want and storing what you want,
wherever you want.
This makes leveraging different optimizers easier than ever. By capturing the high-level
core loop of black-box optimization in a simple [`Trial`][amltk.optimization.Trial] and
a [`Report`][amltk.optimization.Trial.Report], integrating your own optimizer is easy and
gives you everything that AutoML-Toolkit offers at little cost.
An optimizer is the backbone of many AutoML systems and the quickest way
to improve the performance of your current pipelines. However, optimizers vary
in how they expect you to write code and in how much control they take of it,
and they can be quite difficult to interact with through anything other than
their `run()` function.

By setting a simple expectation on an `Optimizer`, e.g. that it should have
an `ask()` and a `tell()`, you are put back in control: you define the loop,
you define what happens and when, and you store what you'd like to record
wherever you'd like to put it.

By unifying their suggestions as a `Trial` and a convenient `Report` to hand back
to them, you can switch between optimizers with minimal changes. We have
added a load of utility to `Trial`s, such that you can easily profile sections,
add extra summary information, store artifacts and export DataFrames.

Check out the [Optimization guide](./optimization.md); we recommend reading the previous
two guides first to fully understand the possibilities with optimization.
We also cover some of these topics in brief in the reference pages. A small
self-contained sketch of the ask-and-tell protocol follows below.

!!! tip "Notable Features"

* An assortment of different optimizers for you to swap in and out with relative ease
through a unified interface.
* A suite of utilities to help you record the data you want from your HPO experiments.
* Full control of how you interact with it, allowing for easy warm-starting, complex
swapping mechanisms or custom stopping criteria.
* A simple interface to integrate your own optimizer.

Check out the [Optimization guide](./optimization.md). We recommend reading the previous
two guides to fully understand the possibilities with optimization.
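
To show how little the _ask-and-tell_ expectation demands, below is a tiny, self-contained random-search optimizer exposing only `ask()` and `tell()`. The `RandomSearch` and `Report` classes and the quadratic objective are illustrative stand-ins of our own, not amltk's actual `Trial` and `Report`.

```python
import random
from dataclasses import dataclass, field


@dataclass
class Report:
    """Stand-in for an optimizer's report of a finished trial."""
    config: dict[str, float]
    loss: float


@dataclass
class RandomSearch:
    """A minimal optimizer meeting the ask-and-tell expectation."""
    space: dict[str, tuple[float, float]]
    history: list[Report] = field(default_factory=list)

    def ask(self) -> dict[str, float]:
        # Suggest a configuration by sampling uniformly from the space.
        return {k: random.uniform(lo, hi) for k, (lo, hi) in self.space.items()}

    def tell(self, report: Report) -> None:
        # Record the outcome; a model-based optimizer would update itself here.
        self.history.append(report)


opt = RandomSearch(space={"x": (-5.0, 5.0)})
for _ in range(20):
    config = opt.ask()               # you run the evaluation...
    loss = (config["x"] - 2.0) ** 2  # ...however and wherever you want
    opt.tell(Report(config, loss))   # ...and report back when you choose

print(min(opt.history, key=lambda r: r.loss))
```

Any optimizer that meets this small contract can be driven by the same loop, which is what makes swapping optimizers cheap.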
