Skip to content

Commit

Permalink
Merge pull request #192 from CogStack/use-medcat-model-packs
Browse files Browse the repository at this point in the history
CU-2e77a98: Model pack django ORM model, admin functionality.
  • Loading branch information
tomolopolis authored Jul 1, 2024
2 parents 555f244 + ccb8177 commit 062b509
Show file tree
Hide file tree
Showing 10 changed files with 229 additions and 8 deletions.
4 changes: 2 additions & 2 deletions nginx/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ http {
proxy_read_timeout 3000;
send_timeout 3000;

#client body size to 6M #
client_max_body_size 3000M;
# Increase the client body size limit - model packs can be over 3 GB.
client_max_body_size 4000M;
# Speed up file transfers by using sendfile() to copy directly
# between descriptors rather than using read()/write().
# For performance reasons, on FreeBSD systems w/ ZFS
Expand Down
1 change: 0 additions & 1 deletion nginx/sites-enabled/medcattrainer
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ server {
server_name localhost;
charset utf-8;
large_client_header_buffers 4 32k;
client_max_body_size 10G;

location /static {
alias /home/api/static;
Expand Down
2 changes: 2 additions & 0 deletions webapp/api/api/admin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@
admin.site.register(ExportedProject, ExportedProjectAdmin)
admin.site.register(ProjectMetrics, ProjectMetricsAdmin)
admin.site.register(Dataset, DatasetAdmin)
admin.site.register(ModelPack, ModelPackAdmin)
admin.site.register(MetaCATModel, MetaCATModelAdmin)
15 changes: 15 additions & 0 deletions webapp/api/api/admin/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,21 @@ class ConceptDBAdmin(admin.ModelAdmin):
actions = [import_concepts, delete_indexed_concepts, reset_cdb_filters]


class ModelPackAdmin(admin.ModelAdmin):
    """Admin configuration for ``ModelPack`` records.

    Only ``name`` and ``model_pack`` are editable in the form; the remaining
    columns are shown read-only in the change list.
    """
    model = ModelPack
    list_display = ('name', 'model_pack', 'concept_db', 'vocab', 'metacats')
    fields = ['name', 'model_pack']

    def metacats(self, obj):
        """Return the string forms of the pack's linked MetaCAT models, comma separated."""
        linked_meta_cats = obj.meta_cats.all()
        return ", ".join(map(str, linked_meta_cats))


class MetaCATModelAdmin(admin.ModelAdmin):
    """Admin configuration for ``MetaCATModel`` records, filterable by meta task."""
    list_filter = ['meta_task']
    list_display = ('name', 'meta_cat_dir')
    model = MetaCATModel


class DocumentAdmin(admin.ModelAdmin):
model = Document
actions = [remove_all_documents]
Expand Down
1 change: 1 addition & 0 deletions webapp/api/api/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class ApiConfig(AppConfig):
def ready(self):
from api.views import _submit_document
from api.models import ProjectAnnotateEntities
from . import signals
resubmit_all = os.environ.get('RESUBMIT_ALL_ON_STARTUP', None)
if resubmit_all is not None and resubmit_all.lower() in ('1', 'y', 'true'):
logger.info('Found env var RESUBMIT_ALL_ON_STARTUP is True. '
Expand Down
34 changes: 34 additions & 0 deletions webapp/api/api/migrations/0078_metacatmodel_modelpack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Generated by Django 2.2.28 on 2024-06-05 23:39

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Schema migration that creates the MetaCATModel and ModelPack tables.
    # Generated by Django; the operations are left exactly as generated.

    # Must be applied after migration 0077 of the 'api' app.
    dependencies = [
        ('api', '0077_projectgroup_create_associated_projects'),
    ]

    operations = [
        # MetaCATModel: a named path to a MetaCAT model, optionally tied to a MetaTask.
        migrations.CreateModel(
            name='MetaCATModel',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100)),
                ('meta_cat_dir', models.FilePathField(allow_folders=True, help_text='The zip or dir for a MetaCAT model')),
                # SET_NULL: the MetaCATModel row is kept when its MetaTask is deleted.
                ('meta_task', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.MetaTask')),
            ],
        ),
        # ModelPack: an uploaded model pack file plus optional links to the
        # ConceptDB, Vocabulary and MetaCATModel rows associated with it.
        migrations.CreateModel(
            name='ModelPack',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.TextField()),
                ('model_pack', models.FileField(help_text='Model pack zip', upload_to='')),
                # CASCADE: deleting the referenced ConceptDB also deletes this ModelPack.
                ('concept_db', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='api.ConceptDB')),
                ('meta_cats', models.ManyToManyField(blank=True, default=None, to='api.MetaCATModel')),
                # CASCADE: deleting the referenced Vocabulary also deletes this ModelPack.
                ('vocab', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='api.Vocabulary')),
            ],
        ),
    ]
139 changes: 135 additions & 4 deletions webapp/api/api/models.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
import logging
import os
import shutil
from zipfile import BadZipFile

import pandas as pd
from django.conf import settings
from django.core.exceptions import ValidationError
from django.core.validators import RegexValidator
from django.db import models
from django.db.models import DO_NOTHING, SET_NULL
from django.dispatch import receiver
from django.forms import forms, ModelForm
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from polymorphic.models import PolymorphicModel

from core.settings import MEDIA_ROOT

STATUS_CHOICES = [
(0, 'Not Validated'),
(1, 'Validated'),
Expand All @@ -22,27 +32,93 @@

cdb_name_validator = RegexValidator(r'^[0-9a-zA-Z_-]*$', 'Only alpahanumeric characters, -, _ are allowed for CDB names')

logger = logging.getLogger(__name__)


class ModelPack(models.Model):
    """An uploaded MedCAT model pack zip and the ORM records derived from it.

    Saving a ``ModelPack`` unpacks the archive and registers the CDB, Vocab and
    MetaCAT models found inside it as ``ConceptDB``, ``Vocabulary`` and
    ``MetaCATModel`` rows linked to this instance.
    """
    name = models.TextField(help_text='')
    model_pack = models.FileField(help_text='Model pack zip')
    concept_db = models.ForeignKey('ConceptDB', on_delete=models.CASCADE, blank=True, null=True)
    vocab = models.ForeignKey('Vocabulary', on_delete=models.CASCADE, blank=True, null=True)
    meta_cats = models.ManyToManyField('MetaCATModel', blank=True, default=None)

    def save(self, *args, **kwargs):
        """Persist the pack, unpack it and register its CDB, Vocab and MetaCATs.

        Raises:
            FileNotFoundError: if the CDB cannot be loaded or registered, or if
                the unpacked pack contains no ``vocab.dat``.
            MedCATLoadException: if loading or registering a MetaCAT model fails.
        """
        # NOTE(review): every call re-runs the whole unpack/registration, so
        # re-saving an existing pack creates a fresh set of ConceptDB /
        # Vocabulary / MetaCATModel rows - confirm whether that is intended.

        # Initial save: gives the instance a primary key (needed before the
        # many-to-many relation can be written) and lets the FileField store
        # the upload so that ``self.model_pack.path`` can be unpacked.
        super().save(*args, **kwargs)
        logger.info('Loading model pack: %s', self.model_pack)
        try:
            CAT.attempt_unpack(self.model_pack.path)
        except BadZipFile as exc:
            # potential for CRC-32 errors in Trainer process - ignore and still use
            logger.warning(f'Possibly corrupt cdb.dat decompressing {self.model_pack}\nFull Exception: {exc}')
        unpacked_model_pack_path = self.model_pack.path.replace('.zip', '')

        # Load the CDB. Any failure in this step is reported as FileNotFoundError.
        try:
            CAT.load_cdb(unpacked_model_pack_path)
            concept_db = ConceptDB()
            unpacked_file_name = self.model_pack.file.name.replace('.zip', '')
            concept_db.cdb_file.name = os.path.join(unpacked_file_name, 'cdb.dat')
            concept_db.name = f'{self.name} - CDB'
            # The CDB was loaded just above, so ConceptDB.save need not load it again.
            concept_db.save(skip_load=True)
            self.concept_db = concept_db
        except Exception as exc:
            raise FileNotFoundError(f'Error loading the CDB from this model pack: {self.model_pack.path}') from exc

        # Load Vocab
        vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat")
        if os.path.exists(vocab_path):
            Vocab.load(vocab_path)
            vocab = Vocabulary()
            # Store the path relative to MEDIA_ROOT.
            vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '')
            vocab.save(skip_load=True)
            self.vocab = vocab
        else:
            raise FileNotFoundError(f'Error loading the Vocab from this model pack: {vocab_path}')

        # Load MetaCATs
        try:
            for meta_cat_dir, meta_cat in CAT.load_meta_cats(unpacked_model_pack_path):
                mc_model = MetaCATModel()
                mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '')
                mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}'
                # Resolve the meta task first: the helper only assigns
                # ``mc_model.meta_task`` in memory, so it must run before the
                # save below for the link to be persisted.
                mc_model.get_or_create_meta_tasks_and_values(meta_cat)
                mc_model.save(unpack_load_meta_cat_dir=False)
                # Link the MetaCAT model to this pack; without this the
                # ``meta_cats`` relation would stay empty.
                self.meta_cats.add(mc_model)
        except Exception as exc:
            raise MedCATLoadException(f'Failure loading MetaCAT models - {unpacked_model_pack_path}') from exc

        # The row already exists after the initial save. A forced INSERT (as
        # passed by ``objects.create()``) would collide on the primary key, so
        # drop the flag and let Django issue an UPDATE instead.
        kwargs.pop('force_insert', None)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name


class ConceptDB(models.Model):
    """A MedCAT concept database (CDB) file registered with the trainer.

    The file name a row was loaded with is remembered so that ``save`` can
    refuse to change the file path of an existing CDB.
    """
    name = models.CharField(max_length=100, default='', blank=True, validators=[cdb_name_validator])
    cdb_file = models.FileField()
    use_for_training = models.BooleanField(default=True)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # File name as loaded from the database; stays None for new instances.
        self.__cdb_field_name = None

    @classmethod
    def from_db(cls, db, field_names, values):
        """Build an instance from a database row and remember its stored file name."""
        inst = super().from_db(db, field_names, values)
        # ``cdb_file`` is missing from ``field_names`` when the field is
        # deferred (e.g. ``.only('name')``). Fall back to None rather than
        # raising IndexError; the path-change check in ``save`` is then skipped.
        inst.__cdb_field_name = next(
            (value for field, value in zip(field_names, values) if field == 'cdb_file'),
            None,
        )
        return inst

    def save(self, *args, skip_load=False, **kwargs):
        """Validate and persist the CDB record.

        Args:
            skip_load: when True, do not try to load the CDB file before saving
                (for callers that have already loaded it).

        Raises:
            MedCATLoadException: if the CDB file fails to load.
            ValidationError: if the file path of an existing CDB was changed.
        """
        # load the CDB, and raise if this fails.
        if not skip_load:
            try:
                CDB.load(self.cdb_file)
            except Exception as exc:
                raise MedCATLoadException(f'Failed to load Concept DB from {self.cdb_file}, '
                                          f'check if this CDB file successfully loads elsewhere') from exc
        if self.__cdb_field_name is not None and self.__cdb_field_name != self.cdb_file.name:
            raise ValidationError('Cannot change file path of existing CDB.')
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name
Expand All @@ -51,10 +127,60 @@ def __str__(self):
class Vocabulary(models.Model):
    """A MedCAT vocabulary file registered with the trainer."""
    vocab_file = models.FileField()

    def _check_vocab_loads(self):
        """Try to load the vocab file, raising MedCATLoadException on any failure."""
        try:
            Vocab.load(self.vocab_file)
        except Exception as load_error:
            raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, '
                                      f'check if this Vocab file successfully loads elsewhere') from load_error

    def save(self, *args, skip_load=False, **kwargs):
        """Persist the record, first checking the file loads unless ``skip_load`` is set."""
        must_validate = not skip_load
        if must_validate:
            self._check_vocab_loads()
        super().save(*args, **kwargs)

    def __str__(self):
        file_name = self.vocab_file.name
        return str(file_name)


class MetaCATModel(models.Model):
    """A named MetaCAT model location, optionally tied to the ``MetaTask`` it predicts."""
    name = models.CharField(max_length=100)
    meta_cat_dir = models.FilePathField(help_text='The zip or dir for a MetaCAT model', allow_folders=True)
    meta_task = models.ForeignKey('MetaTask', on_delete=SET_NULL, blank=True, null=True)

    @staticmethod
    def _find_or_create_task_value(value_name):
        """Return the first MetaTaskValue called ``value_name``, creating it if none exists."""
        task_value = MetaTaskValue.objects.filter(name=value_name).first()
        if not task_value:
            task_value = MetaTaskValue()
            task_value.name = value_name
            task_value.save()
        return task_value

    def get_or_create_meta_tasks_and_values(self, meta_cat: MetaCAT):
        """Point ``self.meta_task`` at the MetaTask matching ``meta_cat``'s category.

        The MetaTask and each of its category values are looked up by name and
        created when missing; the task's ``values`` relation is then replaced
        with exactly those values.

        NOTE(review): ``self.meta_task`` is only assigned here, this method does
        not save ``self`` - callers need to save afterwards to persist the link.
        """
        general_config = meta_cat.config.general
        task_name = general_config.category_name
        task = MetaTask.objects.filter(name=task_name).first()
        if not task:
            task = MetaTask()
            task.name = task_name
            task.save()
        self.meta_task = task

        task_values = []
        for value_name in general_config.category_value2id.keys():
            task_values.append(self._find_or_create_task_value(value_name))
        self.meta_task.values.set(task_values)

    def save(self, *args, unpack_load_meta_cat_dir=False, **kwargs):
        """Persist the record, optionally unpacking and loading the MetaCAT first.

        Raises:
            MedCATLoadException: if ``unpack_load_meta_cat_dir`` is set and the
                archive cannot be unpacked or the MetaCAT cannot be loaded.
        """
        if unpack_load_meta_cat_dir:
            try:
                # load the meta cat model, raise if issues
                # NOTE(review): the archive is read from ``self.meta_cat_dir`` as-is
                # while the extraction target is the same value joined onto
                # MEDIA_ROOT - confirm these two paths are meant to coincide.
                extract_target = os.path.join(MEDIA_ROOT, self.meta_cat_dir)
                shutil.unpack_archive(self.meta_cat_dir, extract_dir=extract_target)
                MetaCAT.load(save_dir_path=extract_target)
            except Exception as load_error:
                raise MedCATLoadException(f'Failed to load MetaCAT from {self.meta_cat_dir}, '
                                          f'check if this MetaCAT dir successfully loads elsewhere') from load_error
        super().save(*args, **kwargs)

    def __str__(self):
        return f'{self.name} - {self.meta_cat_dir!s}'


class Dataset(models.Model):
name = models.CharField(max_length=150)
original_file = models.FileField()
Expand Down Expand Up @@ -355,3 +481,8 @@ def _remove_file(instance, prop):
if getattr(instance, prop):
if os.path.isfile(getattr(instance, prop).path):
os.remove(getattr(instance, prop).path)


class MedCATLoadException(Exception):
    """Raised when a MedCAT artefact (CDB, Vocab or MetaCAT) fails to load."""

    def __init__(self, message):
        # A message is mandatory; it becomes the exception's only argument.
        Exception.__init__(self, message)
12 changes: 12 additions & 0 deletions webapp/api/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ class Meta:
fields = '__all__'


class ModelPackSerializer(serializers.ModelSerializer):
    """Serializer exposing every field of ``ModelPack``."""

    class Meta:
        fields = '__all__'
        model = ModelPack


class MetaCATModelSerializer(serializers.ModelSerializer):
    """Serializer exposing every field of ``MetaCATModel``."""

    class Meta:
        fields = '__all__'
        model = MetaCATModel


class ConceptDBSerializer(serializers.ModelSerializer):
class Meta:
model = ConceptDB
Expand Down
25 changes: 24 additions & 1 deletion webapp/api/api/forms.py → webapp/api/api/signals.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import json
import logging
import os
import shutil

from django.db.models.fields.files import FileField
from django.db.models.signals import post_save, post_delete, pre_save
from django.dispatch import receiver

from api.data_utils import dataset_from_file, delete_orphan_docs, upload_projects_export
from api.models import Dataset, ExportedProject
from api.models import Dataset, ExportedProject, ModelPack
from core.settings import MEDIA_ROOT


logger = logging.getLogger(__name__)


@receiver(post_save, sender=Dataset)
Expand Down Expand Up @@ -36,3 +42,20 @@ def save_exported_projects(sender, instance, **kwargs):
if not instance.trainer_export_file.path.endswith('.json'):
raise Exception("Please make sure the file is a .json file")
upload_projects_export(json.load(open(instance.trainer_export_file.path)))


@receiver(post_delete, sender=ModelPack)
def remove_model_pack_assets(sender, instance, **kwargs):
    """Clean up the records and files that belong to a deleted ``ModelPack``.

    Deletes the linked ConceptDB, Vocabulary and MetaCAT model rows, then
    removes the unpacked model pack directory and the model pack zip from disk.
    Files that are already missing are logged and otherwise ignored.
    """
    if instance.concept_db:
        instance.concept_db.delete(using=None, keep_parents=False)
    if instance.vocab:
        instance.vocab.delete(using=None, keep_parents=False)
    # NOTE(review): this runs on post_delete, by which point Django has likely
    # already removed the many-to-many link rows, so this queryset may always
    # be empty - confirm, and consider doing this part on pre_delete.
    for m_c in instance.meta_cats.all():
        m_c.delete(using=None, keep_parents=False)

    zip_path = instance.model_pack.path
    # Remove the directory and the zip independently: a missing unpacked dir
    # must not stop the zip from being deleted.
    try:
        shutil.rmtree(zip_path.replace(".zip", ""))
    except FileNotFoundError:
        logger.warning("Failure removing Model pack dir. Not found. Likely already deleted")
    try:
        os.remove(zip_path)
    except FileNotFoundError:
        logger.warning("Failure removing Model pack zip. Not found. Likely already deleted")
4 changes: 4 additions & 0 deletions webapp/api/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
vocab = Vocab.load(vocab_path)
VOCAB_MAP[vocab_id] = vocab

# integrated model-pack spacy model not used.
# This assumes specified spacy model is installed...
# Next change will create conditional params to load CDB / Vocab, or
# model-packs directly for a project.
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
CAT_MAP[cat_id] = cat
return cat
Expand Down

0 comments on commit 062b509

Please sign in to comment.