Skip to content

Commit

Permalink
updating script
Browse files Browse the repository at this point in the history
  • Loading branch information
pipparichter committed Jan 10, 2025
1 parent 6019507 commit cae8d34
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 178 deletions.
9 changes: 5 additions & 4 deletions notebooks/gtdb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -62,21 +62,22 @@
"metadata_subset_df.append(metadata_df.drop_duplicates(subset='phylum', keep='first'))\n",
"\n",
"metadata_subset_df = pd.concat(metadata_subset_df)\n",
"metadata_subset_df['prefix'] = [genome_id[:2] for genome_id in metadata_subset_df.genome_id] \n",
"metadata_subset_df.genome_id = [genome_id.replace('GB_', '').replace('RS_', '') for genome_id in metadata_subset_df.genome_id] # Remove the prefixes from the genome IDs. \n",
"metadata_subset_df.set_index('genome_id').to_csv('../data/gtdb_subset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Generate a shell script to run on HPC to extract all genomes, proteins, and annotations from the complete GTDB directories to subset directories. \n",
"script_path = '../scripts/bash/move-gtdb-subset-files.sh'\n",
"with open(script_path, 'w') as f:\n",
" for genome_id in metadata_subset_df.genome_id:\n",
" cmd = f'grep -l {genome_id} ./gtdb_genomes/* | while read file_name; do mv \"$file_name\" ./gtdb_subset_genomes/; done'\n",
" for row in metadata_subset_df.itertuples():\n",
" cmd = f'mv ./gtdb_genomes/{row.prefix}_{row.genome_id}_genomic.fna.gz -t ./gtdb_subset_genomes/'\n",
" # f.write(cmd + '\\n')\n",
" # cmd = f'grep -l {genome_id} ./gtdb_proteins/ | while read file_name; do mv \"$file_name\" ./gtdb_subset_proteins/; done'\n",
" # f.write(cmd + '\\n')\n",
Expand Down
Loading

0 comments on commit cae8d34

Please sign in to comment.