From d553429df388f22d6cacf87d2efcd89efe436c58 Mon Sep 17 00:00:00 2001
From: frimpongopoku
Date: Thu, 19 Oct 2023 11:20:06 +0000
Subject: [PATCH] actual removal, now available in task

---
 src/api/store/common.py                         | 49 ++++++++++++++++++-
 src/api/store/media_library.py                  | 29 +++++------
 .../database_tasks/media_library_cleanup.py     |  6 ++-
 3 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/src/api/store/common.py b/src/api/store/common.py
index 7fde7c2a9..41460d02e 100644
--- a/src/api/store/common.py
+++ b/src/api/store/common.py
@@ -2,6 +2,7 @@
 import datetime
 from functools import reduce
 import io
+from typing import List
 from django.http import FileResponse
 from _main_.settings import AWS_S3_REGION_NAME, AWS_STORAGE_BUCKET_NAME
 from _main_.utils.common import serialize, serialize_all
@@ -530,4 +531,50 @@ def get_duplicate_count(grouped_dupes :dict):
     for duplicates in grouped_dupes.values():
         count +=len(duplicates) -1 # Subtracting 1 because one of them is the main one, and the remaining are duplicates
-    return count
\ No newline at end of file
+    return count
+
+def group_disposable(original: dict, disposable: List[int]):
+    """Identifies which items in the disposable list share the same image reference as the original.
+
+    NB: Because we initially did not dynamically rename uploaded images (many years ago), a user could
+    upload the same image several times; since the file was identical (name and all), each upload kept
+    replacing the existing object in the bucket while creating a new media record in the database. Many
+    duplicate media records therefore still share the original's image reference, and those have to be
+    identified and grouped: they are deleted differently from duplicates that are separate uploads with
+    their own references in the S3 bucket, for which the bucket images are removed as well!
+    """
+
+    media = Media.objects.filter(pk=original.get("id")).first()
+    if not media:
+        return None, None
+    can_delete = Media.objects.filter(pk__in=disposable)
+    only_records = []
+    with_image = []
+    for m in can_delete:
+        if m.file.name == media.file.name:
+            only_records.append(m)
+        else:
+            with_image.append(m)
+
+    return only_records, with_image
+
+def remove_duplicates_and_attach_relations(hash):
+    if not hash:
+        return None
+    dupes = Media.objects.filter(hash=hash)
+    relations = resolve_relations(dupes)
+    media_after_attaching = attach_relations_to_media(relations)
+    disposables = relations.get("disposable", [])
+    disposables = [m.get("id", None) for m in disposables]
+    media = relations.get("media", {})
+    del_only_records, del_with_image = group_disposable(media, disposables)
+    # print("disposables", disposables)
+    if del_only_records:
+        Media.objects.filter(
+            pk__in=[m.id for m in del_only_records]
+        ).delete()  # Bulk queryset delete: removes only the records; it does not call the model's "delete()" method, which is what actually removes the image from the S3 bucket
+
+    if del_with_image:
+        for m in del_with_image:
+            m.delete()  # Deletes both the record and its image in the S3 bucket
+
+    return media_after_attaching
diff --git a/src/api/store/media_library.py b/src/api/store/media_library.py
index b42747d3d..bcda493e9 100644
--- a/src/api/store/media_library.py
+++ b/src/api/store/media_library.py
@@ -4,7 +4,7 @@
 from database.utils.settings.model_constants.user_media_uploads import (
     UserMediaConstants,
 )
-from api.store.common import calculate_hash_for_bucket_item
+from api.store.common import calculate_hash_for_bucket_item, remove_duplicates_and_attach_relations
 from _main_.utils.common import serialize
 from _main_.utils.common import serialize, serialize_all
 from api.store.common import (
@@ -71,23 +71,7 @@ def group_disposable(self, original: dict, disposable: List[int]):
 
     def clean_duplicates(self, args, context: Context):
         hash = args.get("hash", None)
-        dupes = Media.objects.filter(hash=hash)
-        relations = resolve_relations(dupes)
-        media_after_attaching = attach_relations_to_media(relations)
-        disposables = relations.get("disposable", [])
-        disposables = [m.get("id", None) for m in disposables]
-        media = relations.get("media", {})
-        del_only_records, del_with_image = self.group_disposable(media, disposables)
-        # print("disposables", disposables)
-        if del_only_records:
-            Media.objects.filter(
-                pk__in=[m.id for m in del_only_records]
-            ).delete()  # This only deletes records & doesnt fire the models "delete()" function which is what actually deletes actual image from the s3 bucket
-
-        if del_with_image:
-            for m in del_with_image:
-                m.delete()  # will delete record and image from s3 bucket
-
+        media_after_attaching = remove_duplicates_and_attach_relations(hash)
         return serialize(media_after_attaching, True), None
 
     def summarize_duplicates(self, args, context: Context):
@@ -99,6 +83,15 @@ def summarize_duplicates(self, args, context: Context):
         return response, None
 
     def generate_hashes(self, args, context: Context):
+        """
+        Goes over all media items in the database and generates a hash value for each
+        one, saving the hash on the media object. If an image in the bucket is corrupted
+        or otherwise invalid, hash generation for that image simply fails silently.
+
+        This route is only meant to be run once, since many existing images in our system
+        do not have hash values yet. All future uploads get their hashes generated and
+        saved automatically; see the modifications to the "save()" method on the Media model.
+        """
         images = Media.objects.order_by("-id")
         count = 0
         for image in images:
diff --git a/src/task_queue/database_tasks/media_library_cleanup.py b/src/task_queue/database_tasks/media_library_cleanup.py
index 9141ea250..59b76b023 100644
--- a/src/task_queue/database_tasks/media_library_cleanup.py
+++ b/src/task_queue/database_tasks/media_library_cleanup.py
@@ -5,6 +5,7 @@
     find_duplicate_items,
     get_admins_of_communities,
     get_duplicate_count,
+    remove_duplicates_and_attach_relations,
     summarize_duplicates_into_csv,
 )
 from api.utils.constants import MEDIA_LIBRARY_CLEANUP_TEMPLATE
@@ -29,7 +30,10 @@ def remove_duplicate_images():
     num_of_dupes_in_all = get_duplicate_count(grouped_dupes)
     csv_file = summarize_duplicates_into_csv(grouped_dupes)
     admins = get_admins_of_communities(ids)
-
+
+    for hash_value in grouped_dupes.keys():
+        remove_duplicates_and_attach_relations(hash_value)
+
     for admin in admins:
         send_summary_email_to_admin(admin, community, num_of_dupes_in_all, csv_file)
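A note on the two deletion paths this patch relies on: Django's QuerySet.delete() issues a bulk SQL delete and never calls an overridden Model.delete() on the individual rows, which is why del_only_records can be cleared without touching the bucket while del_with_image instances are deleted one at a time. The Media model itself is not part of this diff, so the following is only a minimal sketch, assuming a FileField named "file" backed by S3 storage, of the kind of delete() override the comments refer to:

```python
# Illustrative sketch only -- the real Media model is not shown in this patch,
# and its field names and storage backend are assumptions.
from django.db import models


class Media(models.Model):
    name = models.CharField(max_length=200)
    file = models.FileField(upload_to="media")  # e.g. S3 via django-storages
    hash = models.CharField(max_length=64, null=True, blank=True)

    def delete(self, *args, **kwargs):
        # Remove the stored object first, then the database row.
        # save=False: don't write the cleared field back before the row is gone.
        self.file.delete(save=False)
        return super().delete(*args, **kwargs)


# Media.objects.filter(pk__in=ids).delete() -> bulk SQL delete; skips the
#                                              override, bucket object survives
# media_instance.delete()                   -> runs the override; record and
#                                              bucket object are both removed
```

This is also why the records in del_only_records, which share the surviving original's file, must take the bulk path: running the override on them would delete the original's bucket object out from under it.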
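The generate_hashes docstring leans on calculate_hash_for_bucket_item, whose implementation is outside this diff. As a rough idea of what content-hashing a bucket item can look like, here is a hypothetical stand-in; the digest algorithm, chunk size, and silent-failure behavior are assumptions, not the project's actual helper:

```python
import hashlib


def hash_media_file(file_field, chunk_size=8192):
    """Hypothetical stand-in for calculate_hash_for_bucket_item: digest the
    stored object's bytes, returning None for unreadable or corrupt files so
    callers can fail silently as the generate_hashes docstring describes."""
    try:
        digest = hashlib.md5()
        with file_field.open("rb") as fh:
            for chunk in iter(lambda: fh.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()
    except Exception:
        return None
```

Identical bytes produce identical digests, which is what lets Media.objects.filter(hash=hash) group duplicate uploads regardless of their bucket keys.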