
actual removal, now available in task
frimpongopoku committed Oct 19, 2023
1 parent 9638491 commit d553429
Showing 3 changed files with 64 additions and 20 deletions.
49 changes: 48 additions & 1 deletion src/api/store/common.py
@@ -2,6 +2,7 @@
import datetime
from functools import reduce
import io
from typing import List
from django.http import FileResponse
from _main_.settings import AWS_S3_REGION_NAME, AWS_STORAGE_BUCKET_NAME
from _main_.utils.common import serialize, serialize_all
@@ -530,4 +531,50 @@ def get_duplicate_count(grouped_dupes :dict):
    for duplicates in grouped_dupes.values():
        count += len(duplicates) - 1  # Subtracting 1 because one of them is the main one; the rest are duplicates

return count
return count

def group_disposable(original: dict, disposable: List[int]):
"""This function simply identifies which items in the disposable list share the same image as the original.
NB: because we initially did not dynamically label uploaded images (many many years ago.. lool) there are many scenarios where a user has uploaded the same image multiple times and because the image was the same (with name & everything), it always replaced the existing record in the bucket, but under new media records in the database.
So many duplicate images(media records) still share the same image reference with the original media record. This is why items that share the same image reference as the chosen original need to be identified and grouped.
Deletion for such records will be treated differently than other duplicates where the images are the same as the original, but are completely
different uploads in terms of the reference in the s3 bucket. In such cases the images in the s3 bucket will be removed as well!
"""

    media = Media.objects.filter(pk=original.get("id")).first()
    if not media:
        return None, None
    can_delete = Media.objects.filter(pk__in=disposable)
    only_records = []  # duplicates that point to the same bucket file as the original
    with_image = []    # duplicates that have their own object in the bucket
    for m in can_delete:
        if m.file.name == media.file.name:
            only_records.append(m)
        else:
            with_image.append(m)

    return only_records, with_image
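
For illustration, a minimal usage sketch with hypothetical ids (in practice the original dict and the disposable ids come from resolve_relations):

original = {"id": 42}  # hypothetical: the chosen primary media record
only_records, with_image = group_disposable(original, [43, 44, 45])
# only_records -> duplicates whose file.name matches the original's (delete records only)
# with_image   -> duplicates with their own objects in the bucket (delete records and files)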

def remove_duplicates_and_attach_relations(hash):
    """Collect all media records that share the given hash, attach their relations to one primary record, and dispose of the duplicates."""
    if not hash:
        return None
    dupes = Media.objects.filter(hash=hash)
    relations = resolve_relations(dupes)
    media_after_attaching = attach_relations_to_media(relations)
    disposables = relations.get("disposable", [])
    disposables = [m.get("id", None) for m in disposables]
    media = relations.get("media", {})
    del_only_records, del_with_image = group_disposable(media, disposables)

    if del_only_records:
        # QuerySet.delete() only removes the database records; it does not call the model's
        # delete() method, so the shared file in the S3 bucket is left untouched.
        Media.objects.filter(pk__in=[m.id for m in del_only_records]).delete()

    if del_with_image:
        for m in del_with_image:
            m.delete()  # Calls the model's delete(), removing both the record and its file from the S3 bucket

    return media_after_attaching
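
Note that the two branches above depend on the Media model overriding delete() so that deleting an instance also removes its object from the S3 bucket. That override is not part of this commit; a minimal sketch of the idea, assuming a standard Django FileField backed by S3 storage (field names here are hypothetical):

from django.db import models

class Media(models.Model):
    file = models.FileField(upload_to="media/")  # hypothetical field definition

    def delete(self, *args, **kwargs):
        # FieldFile.delete(save=False) asks the storage backend (S3) to remove the object,
        # after which the database record itself is deleted.
        if self.file:
            self.file.delete(save=False)
        return super().delete(*args, **kwargs)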

29 changes: 11 additions & 18 deletions src/api/store/media_library.py
@@ -4,7 +4,7 @@
from database.utils.settings.model_constants.user_media_uploads import (
UserMediaConstants,
)
from api.store.common import calculate_hash_for_bucket_item
from api.store.common import calculate_hash_for_bucket_item, remove_duplicates_and_attach_relations
from _main_.utils.common import serialize
from _main_.utils.common import serialize, serialize_all
from api.store.common import (
@@ -71,23 +71,7 @@ def group_disposable(self, original: dict, disposable: List[int]):

def clean_duplicates(self, args, context: Context):
hash = args.get("hash", None)
dupes = Media.objects.filter(hash=hash)
relations = resolve_relations(dupes)
media_after_attaching = attach_relations_to_media(relations)
disposables = relations.get("disposable", [])
disposables = [m.get("id", None) for m in disposables]
media = relations.get("media", {})
del_only_records, del_with_image = self.group_disposable(media, disposables)
# print("disposables", disposables)
if del_only_records:
Media.objects.filter(
pk__in=[m.id for m in del_only_records]
).delete() # This only deletes records & doesnt fire the models "delete()" function which is what actually deletes actual image from the s3 bucket

if del_with_image:
for m in del_with_image:
m.delete() # will delete record and image from s3 bucket

media_after_attaching = remove_duplicates_and_attach_relations(hash)
return serialize(media_after_attaching, True), None

def summarize_duplicates(self, args, context: Context):
@@ -99,6 +83,15 @@ def summarize_duplicates(self, args, context: Context):
return response, None

def generate_hashes(self, args, context: Context):
"""
Goes over all media items in the database and generates hash values for them.
It saves the hash value to the media object after that.
If an image is corrupted or not valid in the bucket, the hash generation of that particular image will simply fail silently.
This route is simply meant to be run once, since there are lots of images in our system that dont have their hash values generated already.
All future image uploads will have their hash values generated automatically and saved to the media object.
Check the Media model for the modifications on the "save()" function.
"""
images = Media.objects.order_by("-id")
count = 0
for image in images:
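
For context, the docstring above points at hash generation inside the Media model's save() override, which is not shown in this commit. A minimal sketch of what such an override could look like, assuming calculate_hash_for_bucket_item (imported above) accepts the file's key in the bucket and that failures are swallowed as described (field names here are hypothetical):

class Media(models.Model):
    # ... existing fields ...
    hash = models.CharField(max_length=256, blank=True, null=True)

    def save(self, *args, **kwargs):
        # Hypothetical sketch: compute the hash once, and never let a corrupted
        # or invalid bucket item block the save.
        if not self.hash and self.file:
            try:
                self.hash = calculate_hash_for_bucket_item(self.file.name)
            except Exception:
                pass  # corrupted/invalid images fail silently, as described in generate_hashes
        super().save(*args, **kwargs)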
6 changes: 5 additions & 1 deletion src/task_queue/database_tasks/media_library_cleanup.py
@@ -5,6 +5,7 @@
find_duplicate_items,
get_admins_of_communities,
get_duplicate_count,
remove_duplicates_and_attach_relations,
summarize_duplicates_into_csv,
)
from api.utils.constants import MEDIA_LIBRARY_CLEANUP_TEMPLATE
@@ -29,7 +30,10 @@ def remove_duplicate_images():
num_of_dupes_in_all = get_duplicate_count(grouped_dupes)
csv_file = summarize_duplicates_into_csv(grouped_dupes)
admins = get_admins_of_communities(ids)


for hash_value in grouped_dupes.keys():
remove_duplicates_and_attach_relations(hash_value)

for admin in admins:
send_summary_email_to_admin(admin, community, num_of_dupes_in_all, csv_file)

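
For reference, the new loop assumes grouped_dupes maps each hash to the list of media records sharing it, along the lines of this hypothetical shape (inferred from how get_duplicate_count consumes it):

grouped_dupes = {
    "hash_a": [media_1, media_1_copy, media_1_copy2],
    "hash_b": [media_2, media_2_copy],
}
for hash_value in grouped_dupes.keys():
    remove_duplicates_and_attach_relations(hash_value)  # keeps one record per hash, disposes of the rest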

