Plugin submission to monitor s3 bucket size/object count #1326

Open · wants to merge 1 commit into master
318 changes: 318 additions & 0 deletions plugins/s3_bucket_size/s3_____multi
@@ -0,0 +1,318 @@
#!/usr/bin/env python3

""" Munin plugin to monitor the size and file numbers of a bucket in a S3 compatible storage

=head1 NAME

s3_____multi

This plugin should be linked with a name like this

s3_<endpoint>_<region>_<bucket>_<folder>_multi

Where:
- endpoint is the s3 endpoint. Ex: s3.eu-west-3.amazonaws.com
- region is the s3 region. Ex: eu-west-3
- bucket is the name of your bucket
- folder is optional.
If you specify a folder, the plugin monitors the size of the folders inside that folder instead of the folders at the root of the bucket.
folder can only be the name of a folder at the root of the bucket

Ex: ln -s /path/to/s3_____multi /etc/munin/plugins/s3_s3.eu-west-3.amazonaws.com_eu-west-3_bucket1__multi
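Ex with a folder (the folder name backups is only an illustration): ln -s /path/to/s3_____multi /etc/munin/plugins/s3_s3.eu-west-3.amazonaws.com_eu-west-3_bucket1_backups_multi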

=head1 CONFIGURATION

The following configuration is required:

[s3_<endpoint>_<region>_<bucket>_*]
env.access_key_id ACCESS_KEY
env.secret_access_key SECRET_ACCESS_KEY

The following configuration is optional:

user munin
env.s3hostname 1

Running as munin is optional, but if your default user is nobody, you may end up with a write permission error when running the plugin with the update_cache parameter.
Setting env.s3hostname to any value makes the plugin advertise itself as running on <endpoint>, creating a dedicated entry in the Munin host list.
If you do so, you MUST update your munin.conf file on the Munin master with the following entry:

[<endpoint>]
address <hostname of munin-node server running the script>
use_node_name no

Ex:
[s3.eu-west-3.amazonaws.com]
address myserver.mydomain.tld
use_node_name no

Getting the size of a bucket can take a (very) long time depending on the bucket size.
The script does not perform the actual check every time Munin fetches data (every 5 minutes); at fetch time it reads the data from a local cache.

You MUST run the script yourself to update this cache. To do so, you may want to use a cron entry.
You MUST run the script with munin-run so that it runs as the right user and gets all the environment variables (including MUNIN_PLUGSTATE and MUNIN_CAP_MULTIGRAPH).

A typical command run by cron would be:
sudo -u munin /usr/sbin/munin-run -d s3_s3.eu-west-3.amazonaws.com_eu-west-3_bucket1__multi update_cache
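
Purely as an illustration (the hourly schedule below is an assumption, pick one that matches how long listing your bucket takes), an /etc/cron.d entry could look like:

0 * * * * munin /usr/sbin/munin-run s3_s3.eu-west-3.amazonaws.com_eu-west-3_bucket1__multi update_cache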

IMPORTANT: You will not get any graph until you have run the script with the update_cache parameter at least once.

=head1 REQUIREMENTS

Python 3
boto3 module (pip3 install boto3)

=head1 TODO

Support invocation without a bucket name (s3_<endpoint>_<region>___multi) to graph the size/object count of all buckets

=head1 AUTHOR

Jean-Edouard Babin
https://github.com/jebabin/munin_s3_bucket_size

=head1 LICENSE

GPLv2

=head1 MAGIC MARKERS

#%# capabilities=multigraph

=cut
"""


import json
import logging
import os
import re
import subprocess
import sys

import boto3

# boto3.set_stream_logger('')

""" This is from a preliminary version which was using the s3cmd tool instead of the boto3 lib

def get_folder_list_s3cmd():
process = subprocess.run(['s3cmd', 'ls', 's3://'+bucket + rootdir + '/'], stdout=subprocess.PIPE)
return process.stdout.decode('utf-8')


def get_folder_info_s3cmd(folder):
process = subprocess.run(['s3cmd', 'du', 's3://'+bucket + rootdir + '/' + folder + '/'], stdout=subprocess.PIPE)
return process.stdout.decode('utf-8')


def update_cache_s3cmd(cache_path):
folders = get_folder_list_s3cmd()

folder_dict = {}
for line in folders.split('\n'):
if not line.strip():
continue
match = re.search(r"^\s+DIR\s+.*?\/([^\/]+)\/$", line)
if match is not None:
folder = match.group(1)

folder_info = get_folder_info_s3cmd(folder).split('\n')[0]
# Create the dict entry even if later the command fail to ensure "config" list all
folder_dict[folder] = {}
match = re.search(r"^\s*(\d+)\s+(\d+)", folder_info)
if match is not None:
size = match.group(1)
object = match.group(2)
folder_dict[folder]['size'] = size
folder_dict[folder]['object'] = object

with open(cache_path, 'w') as cache_file:
cache_file.write(json.dumps(folder_dict))

"""

def update_cache(cache_path):
    """Walk the bucket and aggregate size and object count per top-level folder, then write the cache."""
    s3r = boto3.resource('s3', region_name=region, endpoint_url="https://" + host,
                         aws_access_key_id=access_key_id, aws_secret_access_key=secret_access_key)
    s3_bucket = s3r.Bucket(bucket)

    # Without a folder in the plugin name, walk the whole bucket instead of a non-existent "/" prefix
    prefix = rootdir + "/" if rootdir else ""

    folder_dict = {}
    for obj in s3_bucket.objects.filter(Prefix=prefix):
        obj_path = obj.key[len(prefix):]
        folder = obj_path.split('/')[0]
        if folder == "":
            continue
        if folder in folder_dict:
            folder_dict[folder]['size'] += obj.size
            folder_dict[folder]['object'] += 1
        else:
            folder_dict[folder] = {'size': obj.size, 'object': 1}

    with open(cache_path, 'w') as cache_file:
        cache_file.write(json.dumps(folder_dict))


def read_cache(cache_path):
if os.path.isfile(cache_path):
with open(cache_path) as json_file:
data = json.load(json_file)
return data
else:
return None


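# Munin field and graph names may only contain [A-Za-z0-9_] and must not start with a digit,
# so replace every other character with an underscore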
def normalize_name(name):
normal_first = re.sub(r'^[^A-Za-z_]', r'_', name)
return re.sub(r'[^A-Za-z0-9_]', r'_', normal_first)


# Exit if multigraph not supported
is_multigraph_capable = os.getenv('MUNIN_CAP_MULTIGRAPH')
if is_multigraph_capable is None:
sys.exit(1)

# init vars
use_s3hostname = None
host = None
region = None
bucket = None
access_key_id = None
secret_access_key = None
rootdir = ""

# derive vars from the file name
try:
# s3_<endpoint>_<region>_<bucket>_<folder>_multi
match = re.search(r"^(?:|.*\/)s3_([^_]+)_([^_]+)_([^_]+)_([^_]*)_multi$", sys.argv[0])
if match is not None:
host = match.group(1)
region = match.group(2)
bucket = match.group(3)
rootdir = match.group(4)
else:
print("File name doesn't have the exceptect format: s3_<endpoint>_<region>_<bucket>_<folder>_multi")
sys.exit(2)
except Exception as ex:
logging.error("Caught exception: %s" % ex)

# set s3 creds
access_key_id = os.getenv('access_key_id')
secret_access_key = os.getenv('secret_access_key')

if access_key_id is None:
    print('access_key_id environment variable is not defined.')
sys.exit(3)
if secret_access_key is None:
    print('secret_access_key environment variable is not defined.')
sys.exit(4)

# use server or s3 hostname ?
use_s3hostname = os.getenv('s3hostname')

tmpfile = os.getenv('MUNIN_PLUGSTATE') + "/s3_"+host+"_"+region+"_"+bucket+"_"+rootdir+".cache"


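# Dispatch on the optional argument: 'config' prints the graph definitions, 'update_cache'
# rebuilds the local cache, and a plain run prints the current values read from the cache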
if len(sys.argv) == 2:
if sys.argv[1] == "config":
if use_s3hostname is not None:
print('host_name %s' % host)
data = read_cache(tmpfile)
if data is None:
sys.exit(0)
# Size
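        # The first folder is drawn as an AREA and the following ones are STACKed on it,
        # with an additional 'total' line on top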
print('multigraph %s_size' % normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir))
print('graph_category Disk')
if (rootdir == ""):
print('graph_title Size of bucket %s' % bucket)
else:
print('graph_title Size of folder %s in bucket %s' % (rootdir, bucket))
print('graph_vlabel bytes')
i = 0
for folder in data:
print('%s.label %s' % (normalize_name(folder), folder[0:45]))
if i == 0:
print('%s.draw AREA' % normalize_name(folder))
i = 1
else:
print('%s.draw STACK' % normalize_name(folder))
print('total.label Total')
print('total.draw LINE1')

# Size per folder
for folder in data:
print('multigraph %s_size.%s' % (normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir), normalize_name(folder)))
print('data.label %s' % folder[0:45])
print('graph_category Disk')
if (rootdir == ""):
print('graph_title Folder size inside bucket %s' % bucket)
else:
print('graph_title Folder size inside folder %s of bucket %s' % (rootdir, bucket))
print('graph_vlabel bytes')
print('data.draw LINE1')

# Object
print('multigraph %s_object' % normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir))
print('graph_category Disk')
if (rootdir == ""):
print('graph_title Objects in bucket %s' % bucket)
else:
print('graph_title Objects in folder %s of bucket %s' % (rootdir, bucket))
print('graph_vlabel # of objects')
i = 0
for folder in data:
print('%s.label %s' % (normalize_name(folder), folder[0:45]))
if i == 0:
print('%s.draw AREA' % normalize_name(folder))
i = 1
else:
print('%s.draw STACK' % normalize_name(folder))
print('total.label Total')
print('total.draw LINE1')

# Object per folder
for folder in data:
print('multigraph %s_object.%s' % (normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir), normalize_name(folder)))
print('data.label %s' % folder[0:45])
print('graph_category Disk')
if (rootdir == ""):
                print('graph_title Folder objects inside bucket %s' % bucket)
else:
print('graph_title Folder objects inside folder %s of bucket %s' % (rootdir, bucket))
print('graph_vlabel # of objects')
print('data.draw LINE1')


if sys.argv[1] == "update_cache":
update_cache(tmpfile)

else:
data = read_cache(tmpfile)
if data is None:
sys.exit(1)
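    # Sum sizes and object counts over all folders for the 'total' fields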
size_total = 0
object_total = 0
for folder in data:
size_total = size_total + int(data[folder]['size'])
object_total = object_total + int(data[folder]['object'])

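    # Values for the aggregate size graph and for each per-folder sub-graph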
print('multigraph %s_size' % normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir))
for folder in data:
print('%s.value %s' % (normalize_name(folder), data[folder]['size']))
print('total.value %s' % size_total)
for folder in data:
print('multigraph %s_size.%s' % (normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir), normalize_name(folder)))
print('data.value %s' % data[folder]['size'])

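    # Values for the aggregate object-count graph and for each per-folder sub-graph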
print('multigraph %s_object' % normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir))
for folder in data:
print('%s.value %s' % (normalize_name(folder), data[folder]['object']))
    print('total.value %s' % object_total)
for folder in data:
print('multigraph %s_object.%s' % (normalize_name("s3_"+host+"_"+region+"_"+bucket+"_"+rootdir), normalize_name(folder)))
print('data.value %s' % data[folder]['object'])