
actual removal, now available in task
frimpongopoku committed Oct 19, 2023
1 parent 9638491 commit d553429
Showing 3 changed files with 64 additions and 20 deletions.
49 changes: 48 additions & 1 deletion src/api/store/common.py
@@ -2,6 +2,7 @@
import datetime
from functools import reduce
import io
from typing import List
from django.http import FileResponse
from _main_.settings import AWS_S3_REGION_NAME, AWS_STORAGE_BUCKET_NAME
from _main_.utils.common import serialize, serialize_all
@@ -530,4 +531,50 @@ def get_duplicate_count(grouped_dupes :dict):
    for duplicates in grouped_dupes.values():
        count += len(duplicates) - 1  # Subtracting 1 because one of them is the main one; the rest are duplicates

return count
return count

def group_disposable(original: dict, disposable: List[int]):
"""This function simply identifies which items in the disposable list share the same image as the original.
NB: because we initially did not dynamically label uploaded images (many many years ago.. lool) there are many scenarios where a user has uploaded the same image multiple times and because the image was the same (with name & everything), it always replaced the existing record in the bucket, but under new media records in the database.
So many duplicate images(media records) still share the same image reference with the original media record. This is why items that share the same image reference as the chosen original need to be identified and grouped.
Deletion for such records will be treated differently than other duplicates where the images are the same as the original, but are completely
different uploads in terms of the reference in the s3 bucket. In such cases the images in the s3 bucket will be removed as well!
"""

    media = Media.objects.filter(pk=original.get("id")).first()
    if not media:
        return None, None
    can_delete = Media.objects.filter(pk__in=disposable)
    only_records = []  # duplicates that point to the same bucket file as the original
    with_image = []    # duplicates that have their own object in the bucket
    for m in can_delete:
        if m.file.name == media.file.name:
            only_records.append(m)
        else:
            with_image.append(m)

    return only_records, with_image
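
For illustration, a minimal usage sketch with hypothetical ids (in practice the original dict and the disposable ids come from resolve_relations):

original = {"id": 42}  # hypothetical: the chosen primary media record
only_records, with_image = group_disposable(original, [43, 44, 45])
# only_records -> duplicates whose file.name matches the original's (delete records only)
# with_image   -> duplicates with their own objects in the bucket (delete records and files)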

def remove_duplicates_and_attach_relations(hash):
    """Collect all media records that share the given hash, attach their relations to one primary record, and dispose of the duplicates."""
    if not hash:
        return None
    dupes = Media.objects.filter(hash=hash)
    relations = resolve_relations(dupes)
    media_after_attaching = attach_relations_to_media(relations)
    disposables = relations.get("disposable", [])
    disposables = [m.get("id", None) for m in disposables]
    media = relations.get("media", {})
    del_only_records, del_with_image = group_disposable(media, disposables)

    if del_only_records:
        # QuerySet.delete() only removes the database records; it does not call the model's
        # delete() method, so the shared file in the S3 bucket is left untouched.
        Media.objects.filter(pk__in=[m.id for m in del_only_records]).delete()

    if del_with_image:
        for m in del_with_image:
            m.delete()  # Calls the model's delete(), removing both the record and its file from the S3 bucket

    return media_after_attaching
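
Note that the two branches above depend on the Media model overriding delete() so that deleting an instance also removes its object from the S3 bucket. That override is not part of this commit; a minimal sketch of the idea, assuming a standard Django FileField backed by S3 storage (field names here are hypothetical):

from django.db import models

class Media(models.Model):
    file = models.FileField(upload_to="media/")  # hypothetical field definition

    def delete(self, *args, **kwargs):
        # FieldFile.delete(save=False) asks the storage backend (S3) to remove the object,
        # after which the database record itself is deleted.
        if self.file:
            self.file.delete(save=False)
        return super().delete(*args, **kwargs)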

29 changes: 11 additions & 18 deletions src/api/store/media_library.py
@@ -4,7 +4,7 @@
from database.utils.settings.model_constants.user_media_uploads import (
UserMediaConstants,
)
from api.store.common import calculate_hash_for_bucket_item
from api.store.common import calculate_hash_for_bucket_item, remove_duplicates_and_attach_relations
from _main_.utils.common import serialize
from _main_.utils.common import serialize, serialize_all
from api.store.common import (
@@ -71,23 +71,7 @@ def group_disposable(self, original: dict, disposable: List[int]):

def clean_duplicates(self, args, context: Context):
hash = args.get("hash", None)
dupes = Media.objects.filter(hash=hash)
relations = resolve_relations(dupes)
media_after_attaching = attach_relations_to_media(relations)
disposables = relations.get("disposable", [])
disposables = [m.get("id", None) for m in disposables]
media = relations.get("media", {})
del_only_records, del_with_image = self.group_disposable(media, disposables)
# print("disposables", disposables)
if del_only_records:
Media.objects.filter(
pk__in=[m.id for m in del_only_records]
).delete() # This only deletes records & doesnt fire the models "delete()" function which is what actually deletes actual image from the s3 bucket

if del_with_image:
for m in del_with_image:
m.delete() # will delete record and image from s3 bucket

media_after_attaching = remove_duplicates_and_attach_relations(hash)
return serialize(media_after_attaching, True), None

def summarize_duplicates(self, args, context: Context):
@@ -99,6 +83,15 @@ def summarize_duplicates(self, args, context: Context):
return response, None

def generate_hashes(self, args, context: Context):
"""
Goes over all media items in the database and generates hash values for them.
It saves the hash value to the media object after that.
If an image is corrupted or not valid in the bucket, the hash generation of that particular image will simply fail silently.
This route is simply meant to be run once, since there are lots of images in our system that dont have their hash values generated already.
All future image uploads will have their hash values generated automatically and saved to the media object.
Check the Media model for the modifications on the "save()" function.
"""
images = Media.objects.order_by("-id")
count = 0
for image in images:
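
For context, the docstring above points at hash generation inside the Media model's save() override, which is not shown in this commit. A minimal sketch of what such an override could look like, assuming calculate_hash_for_bucket_item (imported above) accepts the file's key in the bucket and that failures are swallowed as described (field names here are hypothetical):

class Media(models.Model):
    # ... existing fields ...
    hash = models.CharField(max_length=256, blank=True, null=True)

    def save(self, *args, **kwargs):
        # Hypothetical sketch: compute the hash once, and never let a corrupted
        # or invalid bucket item block the save.
        if not self.hash and self.file:
            try:
                self.hash = calculate_hash_for_bucket_item(self.file.name)
            except Exception:
                pass  # corrupted/invalid images fail silently, as described in generate_hashes
        super().save(*args, **kwargs)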
6 changes: 5 additions & 1 deletion src/task_queue/database_tasks/media_library_cleanup.py
@@ -5,6 +5,7 @@
find_duplicate_items,
get_admins_of_communities,
get_duplicate_count,
remove_duplicates_and_attach_relations,
summarize_duplicates_into_csv,
)
from api.utils.constants import MEDIA_LIBRARY_CLEANUP_TEMPLATE
@@ -29,7 +30,10 @@ def remove_duplicate_images():
num_of_dupes_in_all = get_duplicate_count(grouped_dupes)
csv_file = summarize_duplicates_into_csv(grouped_dupes)
admins = get_admins_of_communities(ids)


for hash_value in grouped_dupes.keys():
remove_duplicates_and_attach_relations(hash_value)

for admin in admins:
send_summary_email_to_admin(admin, community, num_of_dupes_in_all, csv_file)

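
For reference, the new loop assumes grouped_dupes maps each hash to the list of media records sharing it, along the lines of this hypothetical shape (inferred from how get_duplicate_count consumes it):

grouped_dupes = {
    "hash_a": [media_1, media_1_copy, media_1_copy2],
    "hash_b": [media_2, media_2_copy],
}
for hash_value in grouped_dupes.keys():
    remove_duplicates_and_attach_relations(hash_value)  # keeps one record per hash, disposes of the rest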

