Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add slurm script #7

Merged
merged 7 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ kg-microbe-biomedical:
kg-microbe-biomedical-function-merge:
poetry run kg merge -m duckdb -n 1000000 -e 100000 -s "bacdive, mediadive, madin_etal, rhea_mappings, bactotraits, chebi, ec, envo, go, ncbitaxon, upa, hp, mondo, ctd, wallen_etal, uniprot_human, uniprot_functional_microbes" --merge-label $@

include kg-microbe-merge.Makefile
include kg-microbe-merge.Makefile

2 changes: 1 addition & 1 deletion download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
#
-
url: git://Knowledge-Graph-Hub/kg-microbe/BactoTraits.tar.gz
local_name: BactoTraits.tar.gz
local_name: bactotraits.tar.gz

#
# KG-Microbe [CTD]
Expand Down
32 changes: 32 additions & 0 deletions hpc/run_parallel_merge.sl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Slurm job array: each array task builds one merged graph with `make <target>`.
#SBATCH --account=m4689
#SBATCH --qos=regular
#SBATCH --constraint=cpu
#SBATCH --time=360
#SBATCH --ntasks=1
#SBATCH --mem=425GB
#SBATCH --job-name=parallel_merge
#SBATCH --output=parallel_merge_%A_%a.out
#SBATCH --error=parallel_merge_%A_%a.err
# One array task per entry in `merges` below (indices 0..1). Keep this range
# in sync with the array: an index without an entry has nothing to build.
#SBATCH --array=0-1
#SBATCH -N 1

module load python/3.10

# Stop immediately if the checkout is missing; otherwise the venv would be
# created (and make run) in whatever directory the job started in.
cd kg-microbe-merge || { echo "Cannot cd into kg-microbe-merge" >&2; exit 1; }

# NOTE(review): every array task creates/updates the same venv-merge in this
# directory; tasks that start together may race on it -- confirm this is safe.
python -m venv venv-merge
source venv-merge/bin/activate
pip install poetry
poetry install

# Array of merged graph names (Makefile targets), indexed by array task ID
merges=(
    kg-microbe-core
    kg-microbe-biomedical
)

# Get the merge for this job array task
merge=${merges[$SLURM_ARRAY_TASK_ID]}

# An empty target would make `make` build its default goal instead of a merge.
if [ -z "$merge" ]; then
    echo "No merge target defined for array task '$SLURM_ARRAY_TASK_ID'" >&2
    exit 1
fi

echo "Starting $merge"
# Report success only when make succeeded, and propagate its exit status so
# Slurm records the task as failed rather than completed.
if time poetry run make "$merge"; then
    echo "Finished $merge"
else
    status=$?
    echo "Failed $merge (exit status $status)" >&2
    exit "$status"
fi
33 changes: 33 additions & 0 deletions hpc/run_parallel_merge_biomedical_function.sl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Slurm job (single-task array): builds the biomedical-function merged graph
# with `make <target>`.
#SBATCH --account=m4689
#SBATCH --qos=regular
#SBATCH --constraint=cpu
#SBATCH --time=360
#SBATCH --ntasks=1
#SBATCH --mem=425GB
#SBATCH --job-name=parallel_merge
#SBATCH --output=parallel_merge_%A_%a.out
#SBATCH --error=parallel_merge_%A_%a.err
# One array task per entry in `merges` below (only index 0).
#SBATCH --array=0
#SBATCH -N 1
#SBATCH --mail-type=BEGIN,END
# NOTE(review): the directive below has no option flag (a mail recipient would
# normally be given as --mail-user=<address>) -- confirm before submitting.
#SBATCH [email protected]

module load python/3.10

# Stop immediately if the checkout is missing; otherwise the venv would be
# created (and make run) in whatever directory the job started in.
cd kg-microbe-merge || { echo "Cannot cd into kg-microbe-merge" >&2; exit 1; }

python -m venv venv-merge
source venv-merge/bin/activate
pip install poetry
poetry install

# Array of merged graph names (Makefile targets), indexed by array task ID
# NOTE(review): confirm this name matches an existing Makefile target.
merges=(
    kg-microbe-biomedical-function
)

# Get the merge for this job array task
merge=${merges[$SLURM_ARRAY_TASK_ID]}

# An empty target would make `make` build its default goal instead of a merge.
if [ -z "$merge" ]; then
    echo "No merge target defined for array task '$SLURM_ARRAY_TASK_ID'" >&2
    exit 1
fi

echo "Starting $merge"
# Report success only when make succeeded, and propagate its exit status so
# Slurm (and the END mail) reflects a failed merge.
if time poetry run make "$merge"; then
    echo "Finished $merge"
else
    status=$?
    echo "Failed $merge (exit status $status)" >&2
    exit "$status"
fi
33 changes: 33 additions & 0 deletions hpc/run_parallel_merge_function.sl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Slurm job (single-task array): builds the function merged graph with
# `make <target>`.
#SBATCH --account=m4689
#SBATCH --qos=regular
#SBATCH --constraint=cpu
#SBATCH --time=360
#SBATCH --ntasks=1
#SBATCH --mem=425GB
#SBATCH --job-name=parallel_merge
#SBATCH --output=parallel_merge_%A_%a.out
#SBATCH --error=parallel_merge_%A_%a.err
# One array task per entry in `merges` below (only index 0).
#SBATCH --array=0
#SBATCH -N 1
#SBATCH --mail-type=BEGIN,END
# NOTE(review): the directive below has no option flag (a mail recipient would
# normally be given as --mail-user=<address>) -- confirm before submitting.
#SBATCH [email protected]

module load python/3.10

# Stop immediately if the checkout is missing; otherwise the venv would be
# created (and make run) in whatever directory the job started in.
cd kg-microbe-merge || { echo "Cannot cd into kg-microbe-merge" >&2; exit 1; }

python -m venv venv-merge
source venv-merge/bin/activate
pip install poetry
poetry install

# Array of merged graph names (Makefile targets), indexed by array task ID
merges=(
    kg-microbe-function
)

# Get the merge for this job array task
merge=${merges[$SLURM_ARRAY_TASK_ID]}

# An empty target would make `make` build its default goal instead of a merge.
if [ -z "$merge" ]; then
    echo "No merge target defined for array task '$SLURM_ARRAY_TASK_ID'" >&2
    exit 1
fi

echo "Starting $merge"
# Report success only when make succeeded, and propagate its exit status so
# Slurm (and the END mail) reflects a failed merge.
if time poetry run make "$merge"; then
    echo "Finished $merge"
else
    status=$?
    echo "Failed $merge (exit status $status)" >&2
    exit "$status"
fi
10 changes: 7 additions & 3 deletions kg_microbe_merge/utils/duckdb_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Utility functions for working with DuckDB in the KG Microbe Merge project."""

import os
from pathlib import Path
from typing import List

import duckdb
Expand Down Expand Up @@ -304,7 +305,9 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz
:param priority_sources: List of source names to prioritize.
"""
# Create a DuckDB connection
conn = duckdb.connect("nodes.db")
merge_label = Path(output_file).parent.name
nodes_db_file = f"{merge_label}_nodes.db"
conn = duckdb.connect(nodes_db_file)

# Load the files into DuckDB
load_into_duckdb(conn, nodes_file_list, "combined_nodes")
Expand Down Expand Up @@ -379,7 +382,7 @@ def duckdb_nodes_merge(nodes_file_list, output_file, priority_sources, batch_siz
finally:
# Close the connection
conn.close()
os.remove("nodes.db")
os.remove(nodes_db_file)


def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000):
Expand Down Expand Up @@ -416,7 +419,8 @@ def duckdb_edges_merge(edges_file_list, output_file, batch_size=1000000):
memory usage and allows for processing of very large datasets that exceed available RAM.
"""
os.makedirs(TMP_DIR, exist_ok=True)
db_file = "edges_persistent.db"
merge_label = Path(output_file).parent.name
db_file = f"{merge_label}_edges_persistent.db"
conn = duckdb.connect(db_file)

try:
Expand Down
Loading