Skip to content

Commit

Permalink
Reduce size of test db, anonymize unneeded fields
Browse files Browse the repository at this point in the history
  • Loading branch information
alastair committed Oct 31, 2023
1 parent cf01db8 commit 0c69299
Showing 1 changed file with 128 additions and 22 deletions.
150 changes: 128 additions & 22 deletions general/management/commands/prune_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,34 @@
#


import datetime
import logging
import os
import random

from django.contrib.auth.models import User
from django.contrib.auth.models import User, Group
from django.contrib.admin.models import LogEntry
from django.contrib.sessions.models import Session
from django.core.management.base import BaseCommand
from django.core.management import call_command
from django.db import connection
from django.db.models.signals import post_delete, pre_delete, pre_save, post_save
from silk.models import Request
from accounts.models import DeletedUser, EmailBounce, GdprAcceptance, OldUsername, Profile, ResetEmailRequest, SameUser, UserDeletionRequest, UserEmailSetting, UserFlag
from apiv2.models import ApiV2Client
from donations.models import Donation
from forum.models import Subscription
from general.models import AkismetSpam
from messages.models import MessageBody
from oauth2_provider.models import AccessToken, Application, Grant, RefreshToken

import comments
import forum
import ratings
import sounds
from sounds.models import BulkUploadProgress, DeletedSound, Flag
import tickets
from tickets.models import Ticket, TicketComment

console_logger = logging.getLogger('console')

Expand All @@ -43,33 +58,34 @@ def chunks(l, n):


class Command(BaseCommand):
help = "Delete most of the database to make it smaller"
help = "Delete most of the database to make it smaller for development, and anonymise it"

def add_arguments(self, parser):
parser.add_argument(
'-d', '--keep-downloaders',
dest='downloaders',
default=120000,
default=100000,
type=int,
help='The number of downloaders to keep')

parser.add_argument(
'-u', '-keep-uploaders',
dest='uploaders',
default=10,
'-s', '--keep-sounds',
dest='sounds',
default=5000,
type=int,
help='Percentage of uploaders to keep'
help='Number of sounds to keep'
)

def disconnect_signals(self):
"""Disconnect django signals that update aggregate counts when items are modified. We re-compute
these counts as a separate step"""
post_save.disconnect(forum.models.update_num_threads_on_thread_insert, sender=forum.models.Thread)
pre_save.disconnect(forum.models.update_num_threads_on_thread_update, sender=forum.models.Thread)
pre_save.disconnect(forum.models.index_posts_on_thread_update, sender=forum.models.Thread)
post_delete.disconnect(forum.models.update_last_post_on_thread_delete, sender=forum.models.Thread)
pre_save.disconnect(forum.models.update_num_posts_on_save_if_moderation_changes, sender=forum.models.Post)
post_save.disconnect(forum.models.update_num_posts_on_post_insert, sender=forum.models.Post)
post_delete.disconnect(forum.models.update_last_post_on_post_delete, sender=forum.models.Post)
post_delete.disconnect(forum.models.update_thread_on_post_delete, sender=forum.models.Post)
post_delete.disconnect(ratings.models.post_delete_rating, sender=ratings.models.SoundRating)
post_save.disconnect(ratings.models.update_num_ratings_on_post_save, sender=ratings.models.SoundRating)

Expand All @@ -95,23 +111,40 @@ def delete_some_users(self, userids):
cursor.execute("delete from sounds_packdownloadsound where pack_download_id in (select id from sounds_packdownload where user_id in %s)", [tuple(userids)])
cursor.execute("delete from sounds_packdownload where user_id in %s", [tuple(userids)])
console_logger.info(' - done, user objects')
# This will delete some other related data, but it's not as slow as deleting downloads.
# so we let django do it
User.objects.filter(id__in=userids).delete()
console_logger.info(' - done')

def delete_sound_uploaders(self, pkeep):
"""Delete some percentage of users who have uploaded sounds
def delete_sound_uploaders(self, numkeep):
"""Delete users who have uploaded sounds, keeping at least `numkeep` sounds.
Don't consider users who have uploaded more than 1000 sounds.
Arguments:
pkeep: the percentage of uploaders to keep
numkeep: the number of sounds to keep
"""
console_logger.info('Deleting some uploaders')
userids = User.objects.values_list('id', flat=True).filter(profile__num_sounds__gt=0)
numusers = len(userids)
# If a user has more than this number of sounds then don't add them
# (we want the number of users with sounds to be varied)
max_number_of_sounds_per_user = 100
users = User.objects.values_list('id', 'profile__num_sounds').filter(profile__num_sounds__gt=0)
users = list(users)
all_users_with_sounds = [u[0] for u in users]
random.shuffle(users)
numusers = len(users)
console_logger.info(f'Number of uploaders: {numusers}')
percentage_remove = 1.0 - (pkeep / 100.0)
randusers = sorted(random.sample(userids, int(numusers*percentage_remove)))
ch = [c for c in chunks(randusers, 100)]

totalsounds = 0
userids = []
# Add random users until the number of total sounds is approximately `numkeep`
for u, numsounds in users:
if numsounds <= max_number_of_sounds_per_user:
userids.append(u)
totalsounds += numsounds
if totalsounds > numkeep:
break

console_logger.info(f"Keeping {len(userids)} users with {totalsounds} sounds")

users_not_in_userids = list(set(all_users_with_sounds) - set(userids))
ch = [c for c in chunks(sorted(list(users_not_in_userids)), 1000)]
tot = len(ch)
for i, c in enumerate(ch, 1):
console_logger.info(f' {i}/{tot}')
Expand All @@ -130,17 +163,90 @@ def delete_downloaders(self, numkeep):
random.shuffle(userids)
# Keep `numkeep` users, and delete the rest
randusers = sorted(userids[numkeep:])
ch = [c for c in chunks(randusers, 100000)]
ch = [c for c in chunks(randusers, 10000)]
tot = len(ch)
for i, c in enumerate(ch, 1):
console_logger.info(f' {i}/{tot}')
self.delete_some_users(c)

def handle(self, *args, **options):
def anonymise_database(self):
users_to_update = []
for user in User.objects.all():
user.email = str(user.id) + '@freesound.org'
user.first_name = ''
user.last_name = ''
# this password is 'freesound'
user.password = 'pbkdf2_sha256$36000$PJRTmkaiwSEC$a8+HUj33133PZX7ToOuypT/CfLKNwMeJMXqBJ4QbQPg='
user.is_staff = False
user.is_superuser = False
users_to_update.append(user)
User.objects.bulk_update(users_to_update, ['email', 'first_name', 'last_name', 'password', 'is_staff', 'is_superuser'])

MessageBody.objects.all().update(body='(message body) Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.')
TicketComment.objects.all().update(text='(ticket comment) Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.')

GdprAcceptance.objects.all().update(date_accepted=datetime.datetime.now())

Profile.objects.all().update(
is_whitelisted=False,
not_shown_in_online_users_list=False,
last_stream_email_sent=None,
last_attempt_of_sending_stream_email=None,
is_adult=False,
last_donation_email_sent=None,
donations_reminder_email_sent=False,
)

# Bookmarks? Possibly set created to now()
# Messages (link between 2 users)

User.objects.all().update(last_login=datetime.datetime.now(), date_joined=datetime.datetime.now())
for group in Group.objects.all():
group.user_set.clear()

Ticket.objects.filter(sender=None).delete()

def delete_unneeded_tables(self):
Session.objects.all().delete()
AccessToken.objects.all().delete()
Application.objects.all().delete()
Grant.objects.all().delete()
RefreshToken.objects.all().delete()
OldUsername.objects.all().delete()
ApiV2Client.objects.all().delete()
DeletedUser.objects.all().delete()
EmailBounce.objects.all().delete()
ResetEmailRequest.objects.all().delete()
SameUser.objects.all().delete()
UserDeletionRequest.objects.all().delete()
UserEmailSetting.objects.all().delete()
UserFlag.objects.all().delete()

LogEntry.objects.all().delete()
Donation.objects.all().delete()
# Forum
Subscription.objects.all().delete()
AkismetSpam.objects.all().delete()

# Silk, deletes request, response, associated sql queries
Request.objects.all().delete()

BulkUploadProgress.objects.all().delete()
DeletedSound.objects.all().delete()
Flag.objects.all().delete()

def handle(self, *args, **options):
if os.environ.get('FREESOUND_PRUNE_DATABASE') != '1':
raise Exception('Run this command with env FREESOUND_PRUNE_DATABASE=1 to confirm you want to prune the database')

self.disconnect_signals()
self.delete_unneeded_tables()
# Delete downloaders first, this will remove the majority of the download table
# so that when we delete uploaders, there are not as many download rows for other
# users who have downloaded these uploaders' sounds
self.delete_downloaders(options['downloaders'])
self.delete_sound_uploaders(options['uploaders'])

self.delete_sound_uploaders(options['sounds'])
self.anonymise_database()
# Update counts
call_command('report_count_statuses')
call_command('set_first_post_in_threads')

0 comments on commit 0c69299

Please sign in to comment.