-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #249 from MetaCell/feature/SCKAN-274
Feature/sckan 274
- Loading branch information
Showing
9 changed files
with
33,915 additions
and
33,711 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
123 changes: 75 additions & 48 deletions
123
backend/composer/management/commands/ingest_anatomical_entities.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,64 +1,91 @@ | ||
import csv | ||
|
||
from django.core.management.base import BaseCommand | ||
|
||
from composer.models import AnatomicalEntity | ||
import time | ||
from django.core.management.base import BaseCommand, CommandError | ||
from django.db import transaction | ||
from django.db.utils import IntegrityError | ||
from composer.models import AnatomicalEntity, Synonym | ||
|
||
URI = "o" | ||
NAME = "o_label" | ||
SYNONYM = "o_synonym" | ||
BULK_LIMIT = 100 | ||
|
||
|
||
class Command(BaseCommand):
    """Ingest anatomical entities and their synonyms from one or more CSV files.

    Every row is read through ``csv.DictReader`` and must expose the columns
    named by the module constants ``URI``, ``NAME`` and ``SYNONYM``. Entities
    are keyed by their ontology URI; synonyms are buffered in memory and
    written with ``bulk_create`` every ``BULK_LIMIT`` pending items.

    The whole run executes inside one transaction (``handle`` is decorated
    with ``transaction.atomic``). Database writes whose errors are caught and
    logged are wrapped in their own savepoint, so a caught error does not
    leave the outer transaction in an unusable state.
    """

    help = "Ingests Anatomical Entities CSV file(s)."

    def add_arguments(self, parser):
        """Register the positional CSV paths and the verbose-logging flag."""
        parser.add_argument("csv_files", nargs="+", type=str)
        parser.add_argument("--show_complete_logs", action='store_true',
                            help="Show detailed logs during processing")

    def _flush_synonyms(self, unique_synonyms):
        """Bulk-insert the buffered synonyms and empty the buffer.

        The insert runs in a nested ``atomic`` block (a savepoint) so that a
        database error raised here can be caught by the caller without
        breaking the surrounding transaction. The buffer is only cleared when
        the insert succeeded.
        """
        if not unique_synonyms:
            return
        with transaction.atomic():
            Synonym.objects.bulk_create(unique_synonyms.values(), ignore_conflicts=True)
        unique_synonyms.clear()

    def _process_anatomical_entity(self, name, ontology_uri, synonym, show_complete_logs, processed_uris,
                                   unique_synonyms):
        """Create or update one anatomical entity and queue its synonym.

        Args:
            name: Entity label taken from the row.
            ontology_uri: URI used as the lookup key for the entity.
            synonym: Synonym from the row, or ``None`` when the row has none.
            show_complete_logs: When true, write a line for every rename and
                every queued synonym.
            processed_uris: Set of URIs already seen in this run; mutated here.
            unique_synonyms: Buffer of pending ``Synonym`` objects keyed by
                ``(ontology_uri, synonym.lower())``; mutated here.
        """
        # Only the first row seen for a URI may rename an existing entity.
        is_first_occurrence = ontology_uri not in processed_uris
        renamed = False

        try:
            # Savepoint: an IntegrityError caught below must not poison the
            # transaction opened by ``handle``.
            with transaction.atomic():
                anatomical_entity, created = AnatomicalEntity.objects.get_or_create(
                    ontology_uri=ontology_uri,
                    defaults={"name": name},
                )
                if not created and is_first_occurrence and anatomical_entity.name != name:
                    anatomical_entity.name = name
                    anatomical_entity.save()
                    renamed = True
        except IntegrityError as e:
            self.stdout.write(self.style.ERROR(f"Error processing {ontology_uri}: {e}"))
            return

        if renamed and show_complete_logs:
            self.stdout.write(
                self.style.SUCCESS(f"Updated {anatomical_entity.ontology_uri} name to {name}.")
            )

        processed_uris.add(ontology_uri)

        if not synonym:
            return

        synonym_key = (ontology_uri, synonym.lower())
        if synonym_key in unique_synonyms:
            # Already queued (case-insensitively) for this entity.
            return

        already_stored = Synonym.objects.filter(
            anatomical_entity=anatomical_entity, name__iexact=synonym
        ).exists()
        if already_stored:
            return

        unique_synonyms[synonym_key] = Synonym(anatomical_entity=anatomical_entity, name=synonym)
        if show_complete_logs:
            self.stdout.write(
                self.style.SUCCESS(f"Synonym '{synonym}' added for {anatomical_entity.ontology_uri}."))

    @transaction.atomic
    def handle(self, *args, **options):
        """Process every CSV file given on the command line.

        A missing file or an error while reading a file is logged and the
        command moves on to the next file (best effort); the elapsed time is
        reported at the end.
        """
        start_time = time.time()
        show_complete_logs = options['show_complete_logs']
        unique_synonyms = {}
        processed_uris = set()

        for csv_file in options["csv_files"]:
            try:
                with open(csv_file, newline="", encoding="utf-8", errors="ignore") as csvfile:
                    reader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
                    for current_line, row in enumerate(reader, start=1):
                        if current_line % 100 == 0:
                            self.stdout.write(self.style.NOTICE(f"Processing line {current_line}"))

                        ontology_uri = row[URI].strip()
                        name = row[NAME].strip()
                        synonym = row[SYNONYM].strip() if row[SYNONYM] else None

                        self._process_anatomical_entity(name, ontology_uri, synonym, show_complete_logs,
                                                        processed_uris, unique_synonyms)

                        if len(unique_synonyms) >= BULK_LIMIT:
                            self._flush_synonyms(unique_synonyms)

            except FileNotFoundError:
                self.stdout.write(self.style.ERROR(f"File {csv_file} does not exist."))
            except Exception as e:
                # Deliberate best-effort boundary: report and continue with the next file.
                self.stdout.write(self.style.ERROR(f"An error occurred while processing {csv_file}: {e}"))

        # Ensure any remaining synonyms are created
        try:
            self._flush_synonyms(unique_synonyms)
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"An error occurred during bulk creation: {e}"))

        end_time = time.time()
        self.stdout.write(self.style.SUCCESS(f"Operation completed in {end_time - start_time:.2f} seconds."))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Generated by Django 4.1.4 on 2024-03-13 17:15 | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create the ``Synonym`` model.

    A synonym has an indexed ``name`` (up to 200 characters) and a foreign key
    to ``composer.anatomicalentity`` that cascades on delete and is reachable
    from the entity through the ``synonyms`` related name.
    """

    dependencies = [("composer", "0040_auto_20240213_1301")]

    operations = [
        migrations.CreateModel(
            name="Synonym",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
                    ),
                ),
                ("name", models.CharField(db_index=True, max_length=200)),
                (
                    "anatomical_entity",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="synonyms",
                        to="composer.anatomicalentity",
                    ),
                ),
            ],
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Generated by Django 4.1.4 on 2024-03-13 17:18 | ||
|
||
from django.db import migrations | ||
|
||
|
||
def _repoint_relation(model, relation_name, duplicate, primary):
    """Replace ``duplicate`` with ``primary`` in ``relation_name`` on every ``model`` row that references it."""
    for row in model.objects.filter(**{relation_name: duplicate}):
        related = getattr(row, relation_name)
        related.remove(duplicate)
        related.add(primary)


def deduplicate_anatomical_entities(apps, schema_editor):
    """Merge anatomical entities that share an ontology URI.

    Entities are visited in ascending ``id`` order. The first entity seen for
    a URI is kept as the primary one; every later entity with the same URI is
    turned into a ``Synonym`` of the primary, all relations that referenced it
    are re-pointed to the primary, and the duplicate row is deleted.

    Args:
        apps: Registry used to look up the ``composer`` models via ``get_model``.
        schema_editor: Unused; part of the ``RunPython`` callable signature.
    """
    AnatomicalEntity = apps.get_model('composer', 'AnatomicalEntity')
    Synonym = apps.get_model('composer', 'Synonym')
    ConnectivityStatement = apps.get_model('composer', 'ConnectivityStatement')
    Destination = apps.get_model('composer', 'Destination')
    Via = apps.get_model('composer', 'Via')

    # Every (model, relation) pair that can reference an anatomical entity.
    relations = (
        (ConnectivityStatement, 'origins'),
        (Destination, 'anatomical_entities'),
        (Destination, 'from_entities'),
        (Via, 'anatomical_entities'),
        (Via, 'from_entities'),
    )

    # ontology_uri -> the entity kept for that URI. Holding the instance avoids
    # re-fetching the primary entity for every duplicate.
    primary_by_uri = {}

    for entity in AnatomicalEntity.objects.all().order_by('id'):
        primary_entity = primary_by_uri.setdefault(entity.ontology_uri, entity)
        if primary_entity is entity:
            # First entity seen for this URI: it is the one that stays.
            continue

        # get_or_create keeps (anatomical_entity, name) pairs unique even if
        # the synonym already exists, e.g. when the migration is re-run.
        Synonym.objects.get_or_create(anatomical_entity=primary_entity, name=entity.name)

        for model, relation_name in relations:
            _repoint_relation(model, relation_name, entity, primary_entity)

        # Finally, delete the duplicate entity
        entity.delete()
|
||
|
||
class Migration(migrations.Migration):
    """Data migration that merges anatomical entities sharing an ontology URI."""

    dependencies = [
        ("composer", "0041_synonym"),
    ]

    operations = [
        # The merge deletes rows and cannot be undone, so the reverse step is a
        # no-op; declaring it keeps the migration unappliable instead of
        # making Django refuse to migrate backwards past this point.
        migrations.RunPython(
            deduplicate_anatomical_entities,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
17 changes: 17 additions & 0 deletions
17
backend/composer/migrations/0043_alter_anatomicalentity_ontology_uri.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Generated by Django 4.1.4 on 2024-03-13 17:30 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Make ``AnatomicalEntity.ontology_uri`` a unique ``URLField``."""

    dependencies = [("composer", "0042_auto_20240313_1718")]

    operations = [
        migrations.AlterField(
            model_name="anatomicalentity",
            name="ontology_uri",
            field=models.URLField(unique=True),
        ),
    ]
16 changes: 16 additions & 0 deletions
16
backend/composer/migrations/0044_remove_anatomicalentity_ae_unique_upper_name.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Generated by Django 4.1.4 on 2024-03-13 18:49 | ||
|
||
from django.db import migrations | ||
|
||
|
||
class Migration(migrations.Migration):
    """Drop the ``ae_unique_upper_name`` constraint from ``AnatomicalEntity``."""

    dependencies = [("composer", "0043_alter_anatomicalentity_ontology_uri")]

    operations = [
        migrations.RemoveConstraint(model_name="anatomicalentity", name="ae_unique_upper_name"),
    ]
16 changes: 16 additions & 0 deletions
16
backend/composer/migrations/0045_alter_synonym_unique_together.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Generated by Django 4.1.4 on 2024-03-18 17:57 | ||
|
||
from django.db import migrations | ||
|
||
|
||
class Migration(migrations.Migration):
    """Require each ``Synonym`` to be unique per ``(anatomical_entity, name)`` pair."""

    dependencies = [("composer", "0044_remove_anatomicalentity_ae_unique_upper_name")]

    operations = [
        migrations.AlterUniqueTogether(
            name="synonym",
            unique_together={("anatomical_entity", "name")},
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.