diff --git a/edxval/api.py b/edxval/api.py index 6141ec3d..03480362 100644 --- a/edxval/api.py +++ b/edxval/api.py @@ -3,6 +3,7 @@ """ The internal API for VAL. """ +import os import logging from enum import Enum from uuid import uuid4 @@ -17,15 +18,35 @@ from lxml.etree import Element, SubElement from pysrt.srtexc import Error -from edxval.exceptions import (InvalidTranscriptFormat, - InvalidTranscriptProvider, ValCannotCreateError, - ValCannotUpdateError, ValInternalError, - ValVideoNotFoundError) -from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference, - TranscriptProviderType, Video, VideoImage, - VideoTranscript, ThirdPartyTranscriptCredentialsState) +from edxval.exceptions import ( + InvalidTranscriptFormat, + TranscriptsGenerationException, + InvalidTranscriptProvider, + ValCannotCreateError, + ValCannotUpdateError, + ValInternalError, + ValVideoNotFoundError, +) +from edxval.models import ( + CourseVideo, + EncodedVideo, + Profile, + TranscriptPreference, + TranscriptProviderType, + Video, + VideoImage, + VideoTranscript, + ThirdPartyTranscriptCredentialsState, +) from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer -from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format +from edxval.utils import ( + TranscriptFormat, + THIRD_PARTY_TRANSCRIPTION_PLANS, + create_file_in_fs, + get_transcript_format, +) + +from edxval.transcript_utils import Transcript logger = logging.getLogger(__name__) # pylint: disable=C0103 @@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None): for name in ['profile', 'url', 'file_size', 'bitrate'] } ) + return create_transcripts_xml(video_id, video_el, resource_fs, static_dir) @@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st static_dir (str): The Directory to store transcript file. resource_fs (SubFS): The file system to store transcripts. """ - transcript_name = u'{video_id}-{language_code}.{file_format}'.format( + transcript_filename = '{video_id}-{language_code}.srt'.format( video_id=video_id, - language_code=language_code, - file_format=file_format + language_code=language_code ) transcript_data = get_video_transcript_data(video_id, language_code) if transcript_data: - transcript_content = transcript_data['content'] - create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir) + transcript_content = Transcript.convert( + transcript_data['content'], + input_format=file_format, + output_format=Transcript.SRT + ) + create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir) + + return transcript_filename def create_transcripts_xml(video_id, video_el, resource_fs, static_dir): """ Creates xml for transcripts. - For each transcript elment, an associated transcript file is also created in course OLX. + For each transcript element, an associated transcript file is also created in course OLX. Arguments: video_id (str): Video id of the video. @@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir): if video_transcripts.exists(): transcripts_el = SubElement(video_el, 'transcripts') - exported_language_codes = [] + transcript_files_map = {} for video_transcript in video_transcripts: - if video_transcript.language_code not in exported_language_codes: - language_code = video_transcript.language_code - file_format = video_transcript.file_format + language_code = video_transcript.language_code + file_format = video_transcript.file_format - create_transcript_file( - video_id, - language_code, - file_format, - resource_fs.delegate_fs(), - combine(u'course', static_dir) # File system should not start from /draft directory. + try: + transcript_filename = create_transcript_file( + video_id=video_id, + language_code=language_code, + file_format=file_format, + resource_fs=resource_fs.delegate_fs(), + static_dir=combine(u'course', static_dir) # File system should not start from /draft directory. ) + transcript_files_map[language_code] = transcript_filename + except TranscriptsGenerationException: + # we don't want to halt export in this case, just log and move to the next transcript. + logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id) + continue - SubElement( - transcripts_el, - 'transcript', - { - 'language_code': language_code, - 'file_format': file_format, - 'provider': video_transcript.provider, - } - ) - exported_language_codes.append(video_transcript.language_code) + SubElement( + transcripts_el, + 'transcript', + { + 'language_code': language_code, + 'file_format': Transcript.SRT, + 'provider': video_transcript.provider, + } + ) - return video_el + return dict(xml=video_el, transcripts=transcript_files_map) def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None): @@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider, ) return - # Get file format from transcript content. try: file_format = get_transcript_format(file_content) diff --git a/edxval/exceptions.py b/edxval/exceptions.py index 4ae9e98a..13446001 100644 --- a/edxval/exceptions.py +++ b/edxval/exceptions.py @@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError): This error is raised when an transcript provider is not supported """ pass + + +class TranscriptsGenerationException(ValError): + """ + This error is raised when a transcript content is not parse-able in specified format. + """ + pass diff --git a/edxval/tests/test_api.py b/edxval/tests/test_api.py index 5f54151a..690dd17d 100644 --- a/edxval/tests/test_api.py +++ b/edxval/tests/test_api.py @@ -35,6 +35,7 @@ VideoTranscript) from edxval.serializers import VideoSerializer from edxval.tests import APIAuthTestCase, constants +from edxval.transcript_utils import Transcript def omit_attrs(dict, attrs_to_omit=[]): @@ -973,11 +974,14 @@ def test_no_encodings(self): expected = self.parse_xml(""" """) - self.assert_xml_equal( - api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR), - expected + exported_metadata = api.export_to_xml( + resource_fs=self.file_system, + static_dir=constants.EXPORT_IMPORT_STATIC_DIR, + video_id=constants.VIDEO_DICT_STAR['edx_video_id'], ) + self.assert_xml_equal(exported_metadata['xml'], expected) + def test_no_video_transcript(self): """ Verify that transcript export for video with no transcript is working as expected. @@ -986,11 +990,12 @@ def test_no_video_transcript(self): """) - exported_xml = api.export_to_xml( + exported_metadata = api.export_to_xml( constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR ) + exported_xml = exported_metadata['xml'] self.assert_xml_equal(exported_xml, expected) # Verify that no transcript is present in the XML. @@ -1011,29 +1016,29 @@ def test_basic(self, course_id, image): - + """.format(image=image)) - self.assert_xml_equal( - api.export_to_xml( - constants.VIDEO_DICT_FISH['edx_video_id'], - self.file_system, - constants.EXPORT_IMPORT_STATIC_DIR, - course_id - ), - expected + exported_metadata = api.export_to_xml( + constants.VIDEO_DICT_FISH['edx_video_id'], + self.file_system, + constants.EXPORT_IMPORT_STATIC_DIR, + course_id ) + self.assert_xml_equal(exported_metadata['xml'], expected) + self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de']) + def test_transcript_export(self): """ Test that transcript are exported correctly. """ language_code = 'en' video_id = constants.VIDEO_DICT_FISH['edx_video_id'] - transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'} + transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'} expected_transcript_path = combine( self.temp_dir, combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR) @@ -1045,16 +1050,21 @@ def test_transcript_export(self): - + """) - exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course') + exported_metadata = api.export_to_xml( + video_id=video_id, + course_id='test-course', + resource_fs=self.file_system, + static_dir=constants.EXPORT_IMPORT_STATIC_DIR + ) # Assert video and transcript xml is exported correctly. - self.assert_xml_equal(exported_xml, expected_xml) + self.assert_xml_equal(exported_metadata['xml'], expected_xml) # Verify transcript file is created. self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR)) @@ -1065,7 +1075,13 @@ def test_transcript_export(self): open(combine(expected_transcript_path, transcript_files[language_code])) ).read() transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code) - self.assertEqual(transcript['content'], expected_transcript_content) + transcript_format = os.path.splitext(transcript['file_name'])[1][1:] + exported_transcript_content = Transcript.convert( + transcript['content'], + input_format=transcript_format, + output_format=Transcript.SRT, + ).encode('utf-8') + self.assertEqual(exported_transcript_content, expected_transcript_content) def test_unknown_video(self): @@ -1728,20 +1744,18 @@ def test_import_transcript_from_fs_bad_content(self, mock_logger): """ language_code = 'en' edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id'] - # First create transcript file. + + # First create non utf-8 encoded transcript file in the file system. transcript_file_name = 'invalid-transcript.txt' invalid_transcript = dict( constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON, video_id=edx_video_id, - file_data=u'Привіт, edX вітає вас.'.encode('cp1251') - ) - utils.create_file_in_fs( - invalid_transcript['file_data'], - transcript_file_name, - self.file_system, - constants.EXPORT_IMPORT_STATIC_DIR + file_data=u'Привіт, edX вітає вас.' ) + with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f: + f.write(invalid_transcript['file_data'].encode('cp1251')) + api.import_transcript_from_fs( edx_video_id=edx_video_id, language_code=language_code, diff --git a/edxval/tests/test_transcript_utils.py b/edxval/tests/test_transcript_utils.py new file mode 100644 index 00000000..cd14eb79 --- /dev/null +++ b/edxval/tests/test_transcript_utils.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +""" +Tests for transcript utils. +""" +import ddt +import json +import textwrap +import unittest + +from edxval.transcript_utils import Transcript +from edxval.exceptions import TranscriptsGenerationException + + +@ddt.ddt +class TestTranscriptUtils(unittest.TestCase): + """ + Tests transcripts conversion util. + """ + def setUp(self): + super(TestTranscriptUtils, self).setUp() + + self.srt_transcript = textwrap.dedent("""\ + 0 + 00:00:10,500 --> 00:00:13,000 + Elephant's Dream 大象的梦想 + + 1 + 00:00:15,000 --> 00:00:18,000 + At the left we can see... + + """) + + self.sjson_transcript = textwrap.dedent("""\ + { + "start": [ + 10500, + 15000 + ], + "end": [ + 13000, + 18000 + ], + "text": [ + "Elephant's Dream 大象的梦想", + "At the left we can see..." + ] + } + """) + + @ddt.data( + ('invalid_input_format', 'sjson'), + ('sjson', 'invalid_output_format'), + ('invalid_input_format', 'invalid_output_format') + ) + @ddt.unpack + def test_invalid_transcript_format(self, input_format, output_format): + """ + Tests that transcript conversion raises `AssertionError` on invalid input/output formats. + """ + with self.assertRaises(AssertionError): + Transcript.convert(self.sjson_transcript, input_format, output_format) + + def test_convert_srt_to_srt(self): + """ + Tests that srt to srt conversion works as expected. + """ + expected = self.srt_transcript.decode('utf-8') + actual = Transcript.convert(self.srt_transcript, 'srt', 'srt') + self.assertEqual(actual, expected) + + def test_convert_sjson_to_srt(self): + """ + Tests that the sjson transcript is successfully converted into srt format. + """ + expected = self.srt_transcript.decode('utf-8') + actual = Transcript.convert(self.sjson_transcript, 'sjson', 'srt') + self.assertEqual(actual, expected) + + def test_convert_srt_to_sjson(self): + """ + Tests that the srt transcript is successfully converted into sjson format. + """ + expected = self.sjson_transcript.decode('utf-8') + actual = Transcript.convert(self.srt_transcript, 'srt', 'sjson') + self.assertDictEqual(json.loads(actual), json.loads(expected)) + + def test_convert_invalid_srt_to_sjson(self): + """ + Tests that TranscriptsGenerationException was raises on trying + to convert invalid srt transcript to sjson. + """ + invalid_srt_transcript = 'invalid SubRip file content' + with self.assertRaises(TranscriptsGenerationException): + Transcript.convert(invalid_srt_transcript, 'srt', 'sjson') diff --git a/edxval/transcript_utils.py b/edxval/transcript_utils.py new file mode 100644 index 00000000..058158e7 --- /dev/null +++ b/edxval/transcript_utils.py @@ -0,0 +1,117 @@ +""" +A module containing transcripts utils. +""" +import json +from six import text_type + +from pysrt import SubRipFile, SubRipItem, SubRipTime +from pysrt.srtexc import Error + +from edxval.exceptions import TranscriptsGenerationException + + +class Transcript(object): + """ + Container for transcript methods. + """ + SRT = 'srt' + SJSON = 'sjson' + + @staticmethod + def generate_sjson_from_srt(srt_subs): + """ + Generate transcripts from sjson to SubRip (*.srt). + + Arguments: + srt_subs(SubRip): "SRT" subs object + + Returns: + Subs converted to "SJSON" format. + """ + sub_starts = [] + sub_ends = [] + sub_texts = [] + for sub in srt_subs: + sub_starts.append(sub.start.ordinal) + sub_ends.append(sub.end.ordinal) + sub_texts.append(sub.text.replace('\n', ' ')) + + sjson_subs = { + 'start': sub_starts, + 'end': sub_ends, + 'text': sub_texts + } + return sjson_subs + + @staticmethod + def generate_srt_from_sjson(sjson_subs): + """ + Generate transcripts from sjson to SubRip (*.srt). + + Arguments: + sjson_subs (dict): `sjson` subs. + + Returns: + Subtitles in SRT format. + """ + + output = '' + + equal_len = len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text']) + if not equal_len: + return output + + for i in range(len(sjson_subs['start'])): + item = SubRipItem( + index=i, + start=SubRipTime(milliseconds=sjson_subs['start'][i]), + end=SubRipTime(milliseconds=sjson_subs['end'][i]), + text=sjson_subs['text'][i] + ) + output += (unicode(item)) + output += '\n' + return output + + @classmethod + def convert(cls, content, input_format, output_format): + """ + Convert transcript `content` from `input_format` to `output_format`. + + Arguments: + content: Transcript content byte-stream. + input_format: Input transcript format. + output_format: Output transcript format. + + Accepted input formats: sjson, srt. + Accepted output format: srt, sjson. + + Raises: + TranscriptsGenerationException: On parsing the invalid srt + content during conversion from srt to sjson. + """ + assert input_format in ('srt', 'sjson') + assert output_format in ('srt', 'sjson') + + # Decode the content with utf-8-sig which will also + # skip byte order mark(BOM) character if found. + content = content.decode('utf-8-sig') + + if input_format == output_format: + return content + + if input_format == 'srt': + + if output_format == 'sjson': + try: + # With error handling (set to 'ERROR_RAISE'), we will be getting + # the exception if something went wrong in parsing the transcript. + srt_subs = SubRipFile.from_string(content, error_handling=SubRipFile.ERROR_RAISE) + except Error as ex: # Base exception from pysrt + raise TranscriptsGenerationException(text_type(ex)) + + return json.dumps(cls.generate_sjson_from_srt(srt_subs)) + + if input_format == 'sjson': + + if output_format == 'srt': + return cls.generate_srt_from_sjson(json.loads(content)) diff --git a/edxval/utils.py b/edxval/utils.py index 795c0344..10c5e56f 100644 --- a/edxval/utils.py +++ b/edxval/utils.py @@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir): Arguments: file_data (str): Data to store into the file. file_name (str): File name of the file to be created. - resource_fs (OSFS): Import file system. + file_system (OSFS): Import file system. static_dir (str): The Directory to retrieve transcript file. """ with file_system.open(combine(static_dir, file_name), 'wb') as f: - f.write(file_data) + f.write(file_data.encode('utf-8')) def get_transcript_format(transcript_content): diff --git a/setup.py b/setup.py index 5e019c7f..23a0dc2b 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def load_requirements(*requirements_paths): setup( name='edxval', - version='0.1.14', + version='0.1.15', author='edX', url='http://github.com/edx/edx-val', description='edx-val',