
Find and remove image duplicates #802

Merged
22 commits merged on Nov 3, 2023

Commits (22)
ced8aa9
ignore venv
frimpongopoku Sep 15, 2023
6ffe85c
media model will calc hash before save. Working on route to hash old …
frimpongopoku Sep 15, 2023
d116ab3
big steps towards grouping and detecting dupes, need to test now
frimpongopoku Sep 18, 2023
5ed5748
dupe summary works nicely :fire:
frimpongopoku Sep 19, 2023
9d5b83f
code is leaner and reusable, and being used in "clean"
frimpongopoku Sep 19, 2023
ddfe091
cleanup
frimpongopoku Sep 19, 2023
7c256b8
reattaching to only one media obj works too
frimpongopoku Sep 20, 2023
03e6658
download summary as csv works now :fire:
frimpongopoku Sep 20, 2023
3fa3a1a
actual deletion now when F.E hits ".clean"
frimpongopoku Sep 21, 2023
42de965
disposable groupings to determine when to delete only records and whe…
frimpongopoku Sep 22, 2023
a89d48d
broken more items into pieces and separate fxns, new task, FF based
frimpongopoku Oct 16, 2023
9638491
media library routine cleanup now turned into a task & FF based
frimpongopoku Oct 19, 2023
d553429
actual removal, now available in task
frimpongopoku Oct 19, 2023
d1905d9
light mods and restructuring
frimpongopoku Oct 20, 2023
9b23471
merged migration conflict
frimpongopoku Oct 20, 2023
4c6be29
only one email summary for all will be sent now after the task
frimpongopoku Oct 20, 2023
8803028
communities already come as ids
frimpongopoku Oct 20, 2023
d8874d4
now uses task object passed by the routine itself
frimpongopoku Oct 20, 2023
84d80a8
fixed blow ups :rofl:
frimpongopoku Oct 20, 2023
38587e4
streamlining hash calculation and many other tiny restructuring
frimpongopoku Oct 23, 2023
aae5719
Clean-up - remake migrations
BradHN1 Nov 3, 2023
d3cce39
not IS_LOCAL
BradHN1 Nov 3, 2023
3 changes: 2 additions & 1 deletion src/.gitignore
@@ -13,4 +13,5 @@ deployment/aws/
test_data/
celerybeat-schedule.db
*.rdb*
venv
*.venv
venv
11 changes: 10 additions & 1 deletion src/_main_/utils/common.py
@@ -121,24 +121,33 @@ def rename_fields(args, pairs):

def serialize_all(data, full=False, **kwargs):
    #medium = (kwargs or {}).get("medium", False)
    info = (kwargs or {}).get("info", False)
    if not data:
        return []

    if isinstance(data[0], dict):
        return data


    if full:
        return [d.full_json() for d in data]
    elif info:
        return [d.info() for d in data]
    #elif medium:
    #    return [d.medium_json() for d in data]
    return [d.simple_json() for d in data]


def serialize(data, full=False):
def serialize(data, full=False, **kwargs):
    info = (kwargs or {}).get("info", False)
    if not data:
        return {}

    if full:
        return data.full_json()
    elif info:
        return data.info()

    return data.simple_json()

def check_length(args, field, min_length=5, max_length=40):
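For context on how the new `info` keyword above could be used: a minimal, self-contained sketch. `DemoItem` is a made-up stand-in for a real model; only the `serialize`/`serialize_all` behavior comes from this diff, and the import assumes `src/` is on the Python path.

```python
from _main_.utils.common import serialize, serialize_all

class DemoItem:
    """Stand-in for any model that exposes simple_json()/info()/full_json()."""
    def __init__(self, pk):
        self.pk = pk

    def simple_json(self):
        return {"id": self.pk}

    def info(self):
        return {"id": self.pk, "kind": "info"}

    def full_json(self):
        return {"id": self.pk, "kind": "full"}

items = [DemoItem(1), DemoItem(2)]
print(serialize_all(items))             # default: simple_json() for each item
print(serialize_all(items, info=True))  # new flag: info() for each item
print(serialize(items[0], info=True))   # serialize() now accepts the same keyword
```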
11 changes: 10 additions & 1 deletion src/api/constants.py
@@ -13,4 +13,13 @@
STANDARD_USER = 'standard_user'
GUEST_USER = 'guest_user'
INVITED_USER = 'invited_user'
WHEN_USER_AUTHENTICATED_SESSION_EXPIRES = "WHEN_USER_AUTHENTICATED_SESSION_EXPIRES"
WHEN_USER_AUTHENTICATED_SESSION_EXPIRES = "WHEN_USER_AUTHENTICATED_SESSION_EXPIRES"

CSV_FIELD_NAMES = [
    "media_url",
    "primary_media_id",
    "usage_stats",
    "usage_summary",
    "ids_of_duplicates",
    "duplicates",
]
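These column names presumably feed the downloadable duplicate summary produced by the new `/gallery.duplicates.summary.print` route. A sketch of one way they could be used with Python's standard `csv.DictWriter`; the row values are invented and the real export code is not part of this diff.

```python
import csv
import io

# Column names as defined in src/api/constants.py
CSV_FIELD_NAMES = [
    "media_url",
    "primary_media_id",
    "usage_stats",
    "usage_summary",
    "ids_of_duplicates",
    "duplicates",
]

def build_summary_csv(rows):
    # Write one CSV row per duplicate group, keyed by the constant field names.
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=CSV_FIELD_NAMES)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)
    return buffer.getvalue()

# Example row with placeholder values
print(build_summary_csv([{
    "media_url": "https://example.org/media/1.png",
    "primary_media_id": 1,
    "usage_stats": "{}",
    "usage_summary": "used by 2 events",
    "ids_of_duplicates": "2,3",
    "duplicates": 2,
}]))
```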
62 changes: 62 additions & 0 deletions src/api/handlers/media_library.py
@@ -22,6 +22,68 @@ def registerRoutes(self):
        self.add("/gallery.find", self.find_images)
        self.add("/gallery.item.edit", self.edit_details)

        self.add("/gallery.generate.hashes", self.generate_hashes)  # A temporary route that we will need to run (ONCE!) to generate hashes for already uploaded content
        self.add("/gallery.duplicates.summarize", self.summarize_duplicates)  # Generates a CSV of duplicate images with other useful attributes
        self.add("/gallery.duplicates.clean", self.clean_duplicates)  # Allows you to clean all/some duplicates and transfer relationships to only one record
        self.add("/gallery.duplicates.summary.print", self.print_duplicates)  # Creates a downloadable summary (CSV) of duplicate media

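The four routes above are admin utilities (the @admins_only guards are still commented out in this diff). A rough sketch of how an admin client might call them; the base URL and hash value are placeholders, authentication is omitted, and only the route paths and expected parameters come from this diff.

```python
import requests

BASE = "https://api.example.org"  # placeholder host, not the real deployment

# One-time backfill: hash media uploaded before this PR
requests.post(f"{BASE}/gallery.generate.hashes")

# Build the duplicate summary, then download it as a CSV file
requests.post(f"{BASE}/gallery.duplicates.summarize")
requests.post(f"{BASE}/gallery.duplicates.summary.print", data={"type": "csv"})

# Clean one group of duplicates, identified by the shared hash (required by the validator)
requests.post(f"{BASE}/gallery.duplicates.clean", data={"hash": "<duplicate-group-hash>"})
```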
    # @admins_only
    def print_duplicates(self, request):
        """Creates a downloadable file that contains the summary of duplicate media"""
        context: Context = request.context
        args: dict = context.args
        self.validator.expect("type", str)  # Future enhancement: support 'csv', 'pdf', or other formats; currently only CSV
        args, err = self.validator.verify(args, strict=True)
        if err:
            return err
        response, error = self.service.print_duplicates(args, context)
        if error:
            return error
        # return MassenergizeResponse(data=images)
        return response

    # @admins_only
    def clean_duplicates(self, request):
        """Based on request params, this route can remove duplicates and re-assign relationships for a specific group of similar duplicate items, or do so for all groups"""
        context: Context = request.context
        args: dict = context.args
        self.validator.expect("hash", str, is_required=True)
        args, err = self.validator.verify(args, strict=True)
        if err:
            return err
        images, error = self.service.clean_duplicates(args, context)
        if error:
            return error
        return MassenergizeResponse(data=images)


    def summarize_duplicates(self, request):
        """Creates a summary of duplicate images and a combined list of everywhere they are used on the platform"""
        context: Context = request.context
        args: dict = context.args
        args, err = self.validator.verify(args, strict=True)
        if err:
            return err
        images, error = self.service.summarize_duplicates(args, context)
        if error:
            return error
        return MassenergizeResponse(data=images)

    # @admins_only
    def generate_hashes(self, request):
        """Generates hashes of media images in the database that don't have hashes yet"""
        context: Context = request.context
        args: dict = context.args

        args, err = self.validator.verify(args, strict=True)
        if err:
            return err
        images, error = self.service.generate_hashes(args, context)
        if error:
            return error
        return MassenergizeResponse(data=images)


    @admins_only
    def fetch_content(self, request):
        """Fetches image content related to communities that admins can browse through"""
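The hashing itself happens in the Media model/store layer (per the commit history, the model computes a hash before save), which this diff does not show. Purely as an illustration of the general idea, a content hash over the raw image bytes could look like the sketch below; the function name and the choice of MD5 are assumptions, not the PR's actual implementation.

```python
import hashlib

def compute_media_hash(file_bytes: bytes) -> str:
    # Hash the raw image bytes: identical uploads yield identical digests,
    # which is enough to group exact duplicates together.
    return hashlib.md5(file_bytes).hexdigest()

print(compute_media_hash(b"example image bytes"))
```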
27 changes: 27 additions & 0 deletions src/api/services/media_library.py
@@ -7,6 +7,33 @@ class MediaLibraryService:
    def __init__(self):
        self.store = MediaLibraryStore()

    def print_duplicates(self, args, context):
        response, error = self.store.print_duplicates(args, context)
        if error:
            return None, error
        # return serialize_all(response), None
        return response, None

    def clean_duplicates(self, args, context):
        response, error = self.store.clean_duplicates(args, context)
        if error:
            return None, error
        # return serialize_all(response), None
        return response, None

    def summarize_duplicates(self, args, context):
        response, error = self.store.summarize_duplicates(args, context)
        if error:
            return None, error
        # return serialize_all(response), None
        return response, None

    def generate_hashes(self, args, context):
        response, error = self.store.generate_hashes(args, context)
        if error:
            return None, error
        return response, None

    def fetch_content(self, args):
        images, error = self.store.fetch_content(args)
        if error:
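Each service method returns the usual (response, error) pair and defers the real work to MediaLibraryStore, whose duplicate-detection logic is outside this diff. As a generic illustration of grouping records that share a hash (not the store's actual code):

```python
from collections import defaultdict

def group_by_hash(media_records):
    """Group (id, hash) pairs; any hash shared by more than one id marks a duplicate set."""
    groups = defaultdict(list)
    for media_id, media_hash in media_records:
        if media_hash:  # skip records that have not been hashed yet
            groups[media_hash].append(media_id)
    return {h: ids for h, ids in groups.items() if len(ids) > 1}

print(group_by_hash([(1, "abc"), (2, "abc"), (3, "def"), (4, None)]))
# -> {'abc': [1, 2]}
```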