From 28e2ffc80c58631d4a2afca41251ace207113e49 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Thu, 6 Jun 2024 00:42:06 +0100 Subject: [PATCH 1/3] CU-2e77a98: Model pack django ORM model, admin functionality. A new ModelPack unpacks, test loads the CDB, Vocab, MetaTask models and loads their definitions into Django models if they don't already exist --- webapp/api/api/admin/__init__.py | 2 + webapp/api/api/admin/models.py | 15 ++ webapp/api/api/apps.py | 1 + .../migrations/0078_metacatmodel_modelpack.py | 34 +++++ webapp/api/api/models.py | 139 +++++++++++++++++- webapp/api/api/serializers.py | 12 ++ webapp/api/api/{forms.py => signals.py} | 25 +++- webapp/api/api/utils.py | 4 + 8 files changed, 227 insertions(+), 5 deletions(-) create mode 100644 webapp/api/api/migrations/0078_metacatmodel_modelpack.py rename webapp/api/api/{forms.py => signals.py} (57%) diff --git a/webapp/api/api/admin/__init__.py b/webapp/api/api/admin/__init__.py index 60727e47..c7036e94 100644 --- a/webapp/api/api/admin/__init__.py +++ b/webapp/api/api/admin/__init__.py @@ -17,3 +17,5 @@ admin.site.register(ExportedProject, ExportedProjectAdmin) admin.site.register(ProjectMetrics, ProjectMetricsAdmin) admin.site.register(Dataset, DatasetAdmin) +admin.site.register(ModelPack, ModelPackAdmin) +admin.site.register(MetaCATModel, MetaCATModelAdmin) diff --git a/webapp/api/api/admin/models.py b/webapp/api/api/admin/models.py index 76f7c8ad..ed09f378 100644 --- a/webapp/api/api/admin/models.py +++ b/webapp/api/api/admin/models.py @@ -153,6 +153,21 @@ class ConceptDBAdmin(admin.ModelAdmin): actions = [import_concepts, delete_indexed_concepts, reset_cdb_filters] +class ModelPackAdmin(admin.ModelAdmin): + model = ModelPack + list_display = ('name', 'model_pack', 'concept_db', 'vocab', 'metacats') + fields = ['name', 'model_pack'] + + def metacats(self, obj): + return ", ".join(str(m_c) for m_c in obj.meta_cats.all()) + + +class MetaCATModelAdmin(admin.ModelAdmin): + model = MetaCATModel + list_display = 
('name', 'meta_cat_dir') + list_filter = ['meta_task'] + + class DocumentAdmin(admin.ModelAdmin): model = Document actions = [remove_all_documents] diff --git a/webapp/api/api/apps.py b/webapp/api/api/apps.py index f49ca144..099b9750 100644 --- a/webapp/api/api/apps.py +++ b/webapp/api/api/apps.py @@ -12,6 +12,7 @@ class ApiConfig(AppConfig): def ready(self): from api.views import _submit_document from api.models import ProjectAnnotateEntities + from . import signals resubmit_all = os.environ.get('RESUBMIT_ALL_ON_STARTUP', None) if resubmit_all is not None and resubmit_all.lower() in ('1', 'y', 'true'): logger.info('Found env var RESUBMIT_ALL_ON_STARTUP is True. ' diff --git a/webapp/api/api/migrations/0078_metacatmodel_modelpack.py b/webapp/api/api/migrations/0078_metacatmodel_modelpack.py new file mode 100644 index 00000000..5f52fbf0 --- /dev/null +++ b/webapp/api/api/migrations/0078_metacatmodel_modelpack.py @@ -0,0 +1,34 @@ +# Generated by Django 2.2.28 on 2024-06-05 23:39 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0077_projectgroup_create_associated_projects'), + ] + + operations = [ + migrations.CreateModel( + name='MetaCATModel', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=100)), + ('meta_cat_dir', models.FilePathField(allow_folders=True, help_text='The zip or dir for a MetaCAT model')), + ('meta_task', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.MetaTask')), + ], + ), + migrations.CreateModel( + name='ModelPack', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.TextField()), + ('model_pack', models.FileField(help_text='Model pack zip', upload_to='')), + ('concept_db', models.ForeignKey(blank=True, null=True, 
on_delete=django.db.models.deletion.CASCADE, to='api.ConceptDB')), + ('meta_cats', models.ManyToManyField(blank=True, default=None, to='api.MetaCATModel')), + ('vocab', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='api.Vocabulary')), + ], + ), + ] diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py index 96f385af..f505efa2 100644 --- a/webapp/api/api/models.py +++ b/webapp/api/api/models.py @@ -1,14 +1,24 @@ +import logging import os +import shutil +from zipfile import BadZipFile import pandas as pd from django.conf import settings from django.core.exceptions import ValidationError from django.core.validators import RegexValidator from django.db import models +from django.db.models import DO_NOTHING, SET_NULL from django.dispatch import receiver from django.forms import forms, ModelForm +from medcat.cat import CAT +from medcat.cdb import CDB +from medcat.vocab import Vocab +from medcat.meta_cat import MetaCAT from polymorphic.models import PolymorphicModel +from core.settings import MEDIA_ROOT + STATUS_CHOICES = [ (0, 'Not Validated'), (1, 'Validated'), @@ -22,6 +32,65 @@ cdb_name_validator = RegexValidator(r'^[0-9a-zA-Z_-]*$', 'Only alpahanumeric characters, -, _ are allowed for CDB names') +logger = logging.getLogger(__name__) + + +class ModelPack(models.Model): + name = models.TextField(help_text='') + model_pack = models.FileField(help_text='Model pack zip') + concept_db = models.ForeignKey('ConceptDB', on_delete=models.CASCADE, blank=True, null=True) + vocab = models.ForeignKey('Vocabulary', on_delete=models.CASCADE, blank=True, null=True) + meta_cats = models.ManyToManyField('MetaCATModel', blank=True, default=None) + + def save(self, *args, **kwargs): + super().save(*args, **kwargs) + logger.info('Loading model pack: %s', self.model_pack) + model_pack_name = str(self.model_pack).replace(".zip", "") + try: + CAT.attempt_unpack(self.model_pack.path) + except BadZipFile as exc: + # potential for CRC-32 
errors in Trainer process - ignore and still use + logger.warning(f'Possibly corrupt cdb.dat decompressing {self.model_pack}\nFull Exception: {exc}') + unpacked_model_pack_path = self.model_pack.path.replace('.zip', '') + # attempt to load cdb + try: + CAT.load_cdb(unpacked_model_pack_path) + concept_db = ConceptDB() + unpacked_file_name = self.model_pack.file.name.replace('.zip', '') + concept_db.cdb_file.name = os.path.join(unpacked_file_name, 'cdb.dat') + concept_db.name = f'{self.name} - CDB' + concept_db.save(skip_load=True) + self.concept_db = concept_db + except Exception as exc: + raise FileNotFoundError(f'Error loading the CDB from this model pack: {self.model_pack.path}') from exc + + # Load Vocab + vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat") + if os.path.exists(vocab_path): + Vocab.load(vocab_path) + vocab = Vocabulary() + vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '') + vocab.save(skip_load=True) + self.vocab = vocab + else: + raise FileNotFoundError(f'Error loading the Vocab from this model pack: {vocab_path}') + + # load MetaCATs + try: + # should raise an error if there already is a MetaCAT model with this definition + for meta_cat_dir, meta_cat in CAT.load_meta_cats(unpacked_model_pack_path): + mc_model = MetaCATModel() + mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '') + mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}' + mc_model.save(unpack_load_meta_cat_dir=False) + mc_model.get_or_create_meta_tasks_and_values(meta_cat) + except Exception as exc: + raise MedCATLoadException(f'Failure loading MetaCAT models - {unpacked_model_pack_path}') from exc + super().save(*args, **kwargs) + + def __str__(self): + return self.name + class ConceptDB(models.Model): name = models.CharField(max_length=100, default='', blank=True, validators=[cdb_name_validator]) @@ -29,20 +98,27 @@ class ConceptDB(models.Model): use_for_training = 
models.BooleanField(default=True) def __init__(self, *args, **kwargs): - super(ConceptDB, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.__cdb_field_name = None @classmethod def from_db(cls, db, field_names, values): - inst = super(ConceptDB, cls).from_db(db, field_names, values) + inst = super().from_db(db, field_names, values) inst.__cdb_field_name = [v for f, v in zip(field_names, values) if f == 'cdb_file'][0] return inst - def save(self, *args, **kwargs): + def save(self, *args, skip_load=False, **kwargs, ): + # load the CDB, and raise if this fails. + if not skip_load: + try: + CDB.load(self.cdb_file) + except Exception as exc: + raise MedCATLoadException(f'Failed to load Concept DB from {self.cdb_file}, ' + f'check if this CDB file successfully loads elsewhere') from exc if self.__cdb_field_name is not None and self.__cdb_field_name != self.cdb_file.name: raise ValidationError('Cannot change file path of existing CDB.') else: - super(ConceptDB, self).save(*args, **kwargs) + super().save(*args, **kwargs) def __str__(self): return self.name @@ -51,10 +127,60 @@ def __str__(self): class Vocabulary(models.Model): vocab_file = models.FileField() + def save(self, *args, skip_load=False, **kwargs): + # load the Vocab, and raise if this fails + if not skip_load: + try: + Vocab.load(self.vocab_file) + except Exception as exc: + raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, ' + f'check if this Vocab file successfully loads elsewhere') from exc + super().save(*args, **kwargs) + def __str__(self): return str(self.vocab_file.name) +class MetaCATModel(models.Model): + name = models.CharField(max_length=100) + meta_cat_dir = models.FilePathField(help_text='The zip or dir for a MetaCAT model', allow_folders=True) + meta_task = models.ForeignKey('MetaTask', on_delete=SET_NULL, blank=True, null=True) + + def get_or_create_meta_tasks_and_values(self, meta_cat: MetaCAT): + task = meta_cat.config.general.category_name + mt = 
MetaTask.objects.filter(name=task).first() + if not mt: + mt = MetaTask() + mt.name = task + mt.save() + self.meta_task = mt + + mt_vs = [] + for meta_task_value in meta_cat.config.general.category_value2id.keys(): + mt_v = MetaTaskValue.objects.filter(name=meta_task_value).first() + if not mt_v: + mt_v = MetaTaskValue() + mt_v.name = meta_task_value + mt_v.save() + mt_vs.append(mt_v) + self.meta_task.values.set(mt_vs) + + def save(self, *args, unpack_load_meta_cat_dir=False, **kwargs): + if unpack_load_meta_cat_dir: + try: + # load the meta cat model, raise if issues + model_files = os.path.join(MEDIA_ROOT, self.meta_cat_dir) + shutil.unpack_archive(self.meta_cat_dir, extract_dir=model_files) + MetaCAT.load(save_dir_path=model_files) + except Exception as exc: + raise MedCATLoadException(f'Failed to load MetaCAT from {self.meta_cat_dir}, ' + f'check if this MetaCAT dir successfully loads elsewhere') from exc + super().save(*args, **kwargs) + + def __str__(self): + return f'{self.name} - {str(self.meta_cat_dir)}' + + class Dataset(models.Model): name = models.CharField(max_length=150) original_file = models.FileField() @@ -355,3 +481,8 @@ def _remove_file(instance, prop): if getattr(instance, prop): if os.path.isfile(getattr(instance, prop).path): os.remove(getattr(instance, prop).path) + + +class MedCATLoadException(Exception): + def __init__(self, message): + super().__init__(message) diff --git a/webapp/api/api/serializers.py b/webapp/api/api/serializers.py index 9be1430f..2f6b58cb 100644 --- a/webapp/api/api/serializers.py +++ b/webapp/api/api/serializers.py @@ -35,6 +35,18 @@ class Meta: fields = '__all__' +class ModelPackSerializer(serializers.ModelSerializer): + class Meta: + model = ModelPack + fields = '__all__' + + +class MetaCATModelSerializer(serializers.ModelSerializer): + class Meta: + model = MetaCATModel + fields = '__all__' + + class ConceptDBSerializer(serializers.ModelSerializer): class Meta: model = ConceptDB diff --git a/webapp/api/api/forms.py 
b/webapp/api/api/signals.py similarity index 57% rename from webapp/api/api/forms.py rename to webapp/api/api/signals.py index af445a6a..a2f488bd 100644 --- a/webapp/api/api/forms.py +++ b/webapp/api/api/signals.py @@ -1,12 +1,18 @@ import json +import logging import os +import shutil from django.db.models.fields.files import FileField from django.db.models.signals import post_save, post_delete, pre_save from django.dispatch import receiver from api.data_utils import dataset_from_file, delete_orphan_docs, upload_projects_export -from api.models import Dataset, ExportedProject +from api.models import Dataset, ExportedProject, ModelPack +from core.settings import MEDIA_ROOT + + +logger = logging.getLogger(__name__) @receiver(post_save, sender=Dataset) @@ -36,3 +42,20 @@ def save_exported_projects(sender, instance, **kwargs): if not instance.trainer_export_file.path.endswith('.json'): raise Exception("Please make sure the file is a .json file") upload_projects_export(json.load(open(instance.trainer_export_file.path))) + + +@receiver(post_delete, sender=ModelPack) +def remove_model_pack_assets(sender, instance, **kwargs): + if instance.concept_db: + instance.concept_db.delete(using=None, keep_parents=False) + if instance.vocab: + instance.vocab.delete(using=None, keep_parents=False) + if len(instance.meta_cats.all()) > 0: + for m_c in instance.meta_cats.all(): + m_c.delete(using=None, keep_parents=False) + try: + # rm the model pack unzipped dir & model pack zip + shutil.rmtree(instance.model_pack.path.replace(".zip", "")) + os.remove(instance.model_pack.path) + except FileNotFoundError: + logger.warning("Failure removing Model pack dir or zip. Not found. 
Likely already deleted") diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 6cadf671..efc0d75e 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -262,6 +262,10 @@ def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project): vocab = Vocab.load(vocab_path) VOCAB_MAP[vocab_id] = vocab + # integrated model-pack spacy model not used. + # This assumes specified spacy model is installed... + # Next change will create conditional params to load CDB / Vocab, or + # model-packs directly for a project. cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) CAT_MAP[cat_id] = cat return cat From d24e9b3d49433caeef02e87090d3982bb6932dd3 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Fri, 7 Jun 2024 11:47:13 +0100 Subject: [PATCH 2/3] CU-2e77a98: Increase nginx conf for v large models. Use latest medcat install, needs to be changed to 1.12 release once available --- nginx/nginx.conf | 4 ++-- nginx/sites-enabled/medcattrainer | 1 - webapp/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/nginx/nginx.conf b/nginx/nginx.conf index 478ac573..1c8256a4 100644 --- a/nginx/nginx.conf +++ b/nginx/nginx.conf @@ -57,8 +57,8 @@ http { proxy_read_timeout 3000; send_timeout 3000; - #client body size to 6M # - client_max_body_size 3000M; + # increase client body size - Model packs can be over 3G. + client_max_body_size 4000M; # Speed up file transfers by using sendfile() to copy directly # between descriptors rather than using read()/write(). 
# For performance reasons, on FreeBSD systems w/ ZFS diff --git a/nginx/sites-enabled/medcattrainer b/nginx/sites-enabled/medcattrainer index 5ebec591..88b4c817 100644 --- a/nginx/sites-enabled/medcattrainer +++ b/nginx/sites-enabled/medcattrainer @@ -3,7 +3,6 @@ server { server_name localhost; charset utf-8; large_client_header_buffers 4 32k; - client_max_body_size 10G; location /static { alias /home/api/static; diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 6fb26e5f..789d95fc 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic~=3.0 djangorestframework~=3.10 django-background-tasks~=1.2 openpyxl~=3.0 -medcat~=1.10 +medcat @ git+https://github.com/CogStack/MedCAT@03c68817f4db25cd867ae118ff8dcca12151cf90 From ccb817716efc26b1248724f823d22755050bcb81 Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Wed, 19 Jun 2024 17:01:24 +0100 Subject: [PATCH 3/3] CU-8694gtyem: Update medcat to 1.12 --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 789d95fc..221fe490 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic~=3.0 djangorestframework~=3.10 django-background-tasks~=1.2 openpyxl~=3.0 -medcat @ git+https://github.com/CogStack/MedCAT@03c68817f4db25cd867ae118ff8dcca12151cf90 +medcat~=1.12