Skip to content

Nightly -- Full Network Tests #670

Nightly -- Full Network Tests

Nightly -- Full Network Tests #670

Workflow file for this run

name: Nightly -- Full Network Tests
on:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:
env:
CARGO_INCREMENTAL: 0 # bookkeeping for incremental builds has overhead, not useful in CI.
WORKFLOW_URL: https://github.com/maidsafe/stableset_net/actions/runs
jobs:
e2e:
name: E2E tests
runs-on: ${{ matrix.os }}
strategy:
matrix:
include:
- os: ubuntu-latest
autonomi_path: /home/runner/.local/share/autonomi
- os: windows-latest
autonomi_path: C:\\Users\\runneradmin\\AppData\\Roaming\\autonomi
- os: macos-latest
autonomi_path: /Users/runner/Library/Application\ Support/autonomi
steps:
- uses: actions/checkout@v4
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
continue-on-error: true
- name: Build binaries
run: cargo build --release --bin antnode --bin ant
timeout-minutes: 30
- name: Start a local network
uses: maidsafe/ant-local-testnet-action@main
with:
action: start
enable-evm-testnet: true
node-path: target/release/antnode
platform: ${{ matrix.os }}
build: true
- name: Check if ANT_PEERS and EVM_NETWORK are set
shell: bash
run: |
if [[ -z "$ANT_PEERS" ]]; then
echo "The ANT_PEERS variable has not been set"
exit 1
elif [[ -z "$EVM_NETWORK" ]]; then
echo "The EVM_NETWORK variable has not been set"
exit 1
else
echo "ANT_PEERS has been set to $ANT_PEERS"
echo "EVM_NETWORK has been set to $EVM_NETWORK"
fi
# only these unit tests require a network, the rest are run above in unit test section
- name: Run autonomi --tests
run: cargo test --package autonomi --tests -- --nocapture
env:
ANT_LOG: "v"
# only set the target dir for windows to bypass the linker issue.
# happens if we build the node manager via testnet action
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 15
# FIXME: do this in a generic way for localtestnets
- name: export default secret key
if: matrix.os != 'windows-latest'
run: echo "SECRET_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" >> $GITHUB_ENV
shell: bash
- name: Set secret key for Windows
if: matrix.os == 'windows-latest'
run: echo "SECRET_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
shell: pwsh
- name: Get file cost
run: ./target/release/ant --log-output-dest=data-dir file cost "./resources"
env:
ANT_LOG: "v"
timeout-minutes: 15
- name: File upload
run: ./target/release/ant --log-output-dest=data-dir file upload "./resources" > ./upload_output 2>&1
env:
ANT_LOG: "v"
timeout-minutes: 15
- name: parse address (unix)
if: matrix.os != 'windows-latest'
run: |
UPLOAD_ADDRESS=$(rg "At address: ([0-9a-f]*)" -o -r '$1' ./upload_output)
echo "UPLOAD_ADDRESS=$UPLOAD_ADDRESS" >> $GITHUB_ENV
shell: bash
- name: parse address (win)
if: matrix.os == 'windows-latest'
run: |
$UPLOAD_ADDRESS = rg "At address: ([0-9a-f]*)" -o -r '$1' ./upload_output
echo "UPLOAD_ADDRESS=$UPLOAD_ADDRESS" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
shell: pwsh
- name: File Download
run: ./target/release/ant --log-output-dest=data-dir file download ${{ env.UPLOAD_ADDRESS }} ./downloaded_resources
env:
ANT_LOG: "v"
timeout-minutes: 5
- name: Stop the local network and upload logs
if: always()
uses: maidsafe/ant-local-testnet-action@main
with:
action: stop
log_file_prefix: safe_test_logs_e2e
platform: ${{ matrix.os }}
- name: post notification to slack on failure
if: ${{ failure() }}
uses: bryannice/[email protected]
env:
SLACK_INCOMING_WEBHOOK: ${{ secrets.SLACK_GH_ACTIONS_WEBHOOK_URL }}
SLACK_MESSAGE: "Please check the logs for the run at ${{ env.WORKFLOW_URL }}/${{ github.run_id }}"
SLACK_TITLE: "Nightly E2E Test Run Failed"
full_unit:
name: Full Unit Tests (including proptests)
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
steps:
- uses: actions/checkout@v4
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
continue-on-error: true
- name: Build unit tests before running
run: cargo test --release --lib --bins --no-run
timeout-minutes: 30
- name: Run autonomi tests
timeout-minutes: 25
run: cargo test --release --package autonomi --lib --features="full"
- name: Run bootstrap tests
timeout-minutes: 25
run: cargo test --release --package ant-bootstrap
- name: Run launchpad tests
timeout-minutes: 25
run: cargo test --release --package node-launchpad
- name: Run node tests
timeout-minutes: 25
run: cargo test --release --package ant-node --lib
- name: Run network tests
timeout-minutes: 25
run: cargo test --release --package ant-networking --features="open-metrics"
- name: Run protocol tests
timeout-minutes: 25
run: cargo test --release --package ant-protocol
- name: Run logging tests
timeout-minutes: 25
run: cargo test --release --package ant-logging
- name: post notification to slack on failure
if: ${{ failure() }}
uses: bryannice/[email protected]
env:
SLACK_INCOMING_WEBHOOK: ${{ secrets.SLACK_GH_ACTIONS_WEBHOOK_URL }}
SLACK_MESSAGE: "Please check the logs for the run at ${{ env.WORKFLOW_URL }}/${{ github.run_id }}"
SLACK_TITLE: "Nightly Unit Test Run Failed"
churn:
name: Network churning tests
runs-on: ${{ matrix.os }}
strategy:
matrix:
include:
- os: ubuntu-latest
node_data_path: /home/runner/.local/share/autonomi/node
autonomi_path: /home/runner/.local/share/autonomi
- os: windows-latest
node_data_path: C:\\Users\\runneradmin\\AppData\\Roaming\\autonomi\\node
autonomi_path: C:\\Users\\runneradmin\\AppData\\Roaming\\autonomi
- os: macos-latest
node_data_path: /Users/runner/Library/Application Support/autonomi/node
autonomi_path: /Users/runner/Library/Application Support/autonomi
steps:
- uses: actions/checkout@v4
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
continue-on-error: true
- name: Build binaries
run: cargo build --release --bin antnode
timeout-minutes: 30
- name: Build churn tests
run: cargo test --release -p ant-node --test data_with_churn --no-run
env:
# only set the target dir for windows to bypass the linker issue.
# happens if we build the node manager via testnet action
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 30
- name: Start a local network
uses: maidsafe/ant-local-testnet-action@main
with:
action: start
enable-evm-testnet: true
node-path: target/release/antnode
platform: ${{ matrix.os }}
build: true
- name: Chunks data integrity during nodes churn (during 10min) (in theory)
run: cargo test --release -p ant-node --test data_with_churn -- --nocapture
env:
TEST_DURATION_MINS: 60
TEST_CHURN_CYCLES: 6
ANT_LOG: "all"
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 90
- name: Stop the local network and upload logs
if: always()
uses: maidsafe/ant-local-testnet-action@main
with:
action: stop
log_file_prefix: safe_test_logs_churn
platform: ${{ matrix.os }}
- name: Get total node count
shell: bash
timeout-minutes: 1
run: |
node_count=$(ls "${{ matrix.node_data_path }}" | wc -l)
echo "Node dir count is $node_count"
- name: Get restart of nodes using rg
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of restarts
# TODO: make this use an env var, or relate to testnet size
run: |
restart_count=$(rg "Node is restarting in" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Restarted $restart_count nodes"
- name: Get peers removed from nodes using rg
shell: bash
timeout-minutes: 1
run: |
peer_removed=$(rg "PeerRemovedFromRoutingTable" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o) || { echo "Failed to extract peer removal count"; exit 1; }
if [ -z "$peer_removed" ]; then
echo "No peer removal count found"
exit 1
fi
echo "PeerRemovedFromRoutingTable $peer_removed times"
- name: Verify peers removed exceed restarted node counts
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of restarts
# TODO: make this use an env var, or relate to testnet size
run: |
restart_count=$(rg "Node is restarting in" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Restart $restart_count nodes"
peer_removed=$(rg "PeerRemovedFromRoutingTable" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "PeerRemovedFromRoutingTable $peer_removed times"
if [ $peer_removed -lt $restart_count ]; then
echo "PeerRemovedFromRoutingTable times of: $peer_removed is less than the restart count of: $restart_count"
exit 1
fi
# TODO: reenable this once the testnet dir creation is tidied up to avoid a large count here
# if [ $restart_count -lt $node_count ]; then
# echo "Restart count of: $restart_count is less than the node count of: $node_count"
# exit 1
# fi
- name: Verify data replication using rg
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of replication
# TODO: make this use an env var, or relate to testnet size
run: |
fetching_attempt_count=$(rg "FetchingKeysForReplication" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Carried out $fetching_attempt_count fetching attempts"
node_count=$(ls "${{ matrix.node_data_path }}" | wc -l)
if [ $fetching_attempt_count -lt $node_count ]; then
echo "Replication fetching attempts of: $fetching_attempt_count is less than the node count of: $node_count"
exit 1
fi
- name: post notification to slack on failure
if: ${{ failure() }}
uses: bryannice/[email protected]
env:
SLACK_INCOMING_WEBHOOK: ${{ secrets.SLACK_GH_ACTIONS_WEBHOOK_URL }}
SLACK_MESSAGE: "Please check the logs for the run at ${{ env.WORKFLOW_URL }}/${{ github.run_id }}"
SLACK_TITLE: "Nightly Churn Test Run Failed"
# Only error out after uploading the logs
- name: Don't log raw data
if: matrix.os != 'windows-latest' # causes error
shell: bash
timeout-minutes: 10
run: |
if ! rg '^' "${{ matrix.autonomi_path }}"/*/*/logs | awk 'length($0) > 15000 { print; exit 1 }'
then
echo "We are logging an extremely large data"
exit 1
fi
verify_data_location_routing_table:
name: Verify data location and Routing Table
runs-on: ${{ matrix.os }}
strategy:
matrix:
include:
- os: ubuntu-latest
node_data_path: /home/runner/.local/share/autonomi/node
autonomi_path: /home/runner/.local/share/autonomi
- os: windows-latest
node_data_path: C:\\Users\\runneradmin\\AppData\\Roaming\\autonomi\\node
autonomi_path: C:\\Users\\runneradmin\\AppData\\Roaming\\autonomi
- os: macos-latest
node_data_path: /Users/runner/Library/Application Support/autonomi/node
autonomi_path: /Users/runner/Library/Application Support/autonomi
steps:
- uses: actions/checkout@v4
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
continue-on-error: true
- name: Build binaries
run: cargo build --release --bin antnode
timeout-minutes: 30
- name: Build data location and routing table tests
run: cargo test --release -p ant-node --test verify_data_location --test verify_routing_table --no-run
env:
# only set the target dir for windows to bypass the linker issue.
# happens if we build the node manager via testnet action
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 30
- name: Start a local network
uses: maidsafe/ant-local-testnet-action@main
with:
action: start
enable-evm-testnet: true
node-path: target/release/antnode
platform: ${{ matrix.os }}
build: true
- name: Verify the Routing table of the nodes
run: cargo test --release -p ant-node --test verify_routing_table -- --nocapture
env:
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 5
- name: Verify the location of the data on the network
run: cargo test --release -p ant-node --test verify_data_location -- --nocapture
env:
ANT_LOG: "all"
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 90
- name: Verify the routing tables of the nodes
run: cargo test --release -p ant-node --test verify_routing_table -- --nocapture
env:
CARGO_TARGET_DIR: ${{ matrix.os == 'windows-latest' && './test-target' || '.' }}
timeout-minutes: 5
- name: Stop the local network and upload logs
if: always()
uses: maidsafe/ant-local-testnet-action@main
with:
action: stop
log_file_prefix: safe_test_logs_data_location
platform: ${{ matrix.os }}
- name: Verify restart of nodes using rg
shell: bash
timeout-minutes: 1
# get the counts, then the specific line, and then the digit count only
# then check we have an expected level of restarts
# TODO: make this use an env var, or relate to testnet size
run: |
restart_count=$(rg "Node is restarting in" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "Restart $restart_count nodes"
peer_removed=$(rg "PeerRemovedFromRoutingTable" "${{ matrix.node_data_path }}" -c --stats | \
rg "(\d+) matches" | rg "\d+" -o)
echo "PeerRemovedFromRoutingTable $peer_removed times"
if [ $peer_removed -lt $restart_count ]; then
echo "PeerRemovedFromRoutingTable times of: $peer_removed is less than the restart count of: $restart_count"
exit 1
fi
node_count=$(ls "${{ matrix.node_data_path }}" | wc -l)
echo "Node dir count is $node_count"
- name: post notification to slack on failure
if: ${{ failure() }}
uses: bryannice/[email protected]
env:
SLACK_INCOMING_WEBHOOK: ${{ secrets.SLACK_GH_ACTIONS_WEBHOOK_URL }}
SLACK_MESSAGE: "Please check the logs for the run at ${{ env.WORKFLOW_URL }}/${{ github.run_id }}"
SLACK_TITLE: "Nightly Data Location Test Run Failed"
# Only error out after uploading the logs
- name: Don't log raw data
if: matrix.os != 'windows-latest' # causes error
shell: bash
timeout-minutes: 10
run: |
if ! rg '^' "${{ matrix.autonomi_path }}"/*/*/logs | awk 'length($0) > 15000 { print; exit 1 }'
then
echo "We are logging an extremely large data"
exit 1
fi