diff --git a/edxval/api.py b/edxval/api.py
index 6141ec3d..03480362 100644
--- a/edxval/api.py
+++ b/edxval/api.py
@@ -3,6 +3,7 @@
"""
The internal API for VAL.
"""
+import os
import logging
from enum import Enum
from uuid import uuid4
@@ -17,15 +18,35 @@
from lxml.etree import Element, SubElement
from pysrt.srtexc import Error
-from edxval.exceptions import (InvalidTranscriptFormat,
- InvalidTranscriptProvider, ValCannotCreateError,
- ValCannotUpdateError, ValInternalError,
- ValVideoNotFoundError)
-from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
- TranscriptProviderType, Video, VideoImage,
- VideoTranscript, ThirdPartyTranscriptCredentialsState)
+from edxval.exceptions import (
+ InvalidTranscriptFormat,
+ TranscriptsGenerationException,
+ InvalidTranscriptProvider,
+ ValCannotCreateError,
+ ValCannotUpdateError,
+ ValInternalError,
+ ValVideoNotFoundError,
+)
+from edxval.models import (
+ CourseVideo,
+ EncodedVideo,
+ Profile,
+ TranscriptPreference,
+ TranscriptProviderType,
+ Video,
+ VideoImage,
+ VideoTranscript,
+ ThirdPartyTranscriptCredentialsState,
+)
from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
-from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
+from edxval.utils import (
+ TranscriptFormat,
+ THIRD_PARTY_TRANSCRIPTION_PLANS,
+ create_file_in_fs,
+ get_transcript_format,
+)
+
+from edxval.transcript_utils import Transcript
logger = logging.getLogger(__name__) # pylint: disable=C0103
@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
for name in ['profile', 'url', 'file_size', 'bitrate']
}
)
+
return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)
@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
static_dir (str): The Directory to store transcript file.
resource_fs (SubFS): The file system to store transcripts.
"""
- transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
+ transcript_filename = '{video_id}-{language_code}.srt'.format(
video_id=video_id,
- language_code=language_code,
- file_format=file_format
+ language_code=language_code
)
transcript_data = get_video_transcript_data(video_id, language_code)
if transcript_data:
- transcript_content = transcript_data['content']
- create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
+ transcript_content = Transcript.convert(
+ transcript_data['content'],
+ input_format=file_format,
+ output_format=Transcript.SRT
+ )
+ create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)
+
+ return transcript_filename
def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
"""
Creates xml for transcripts.
- For each transcript elment, an associated transcript file is also created in course OLX.
+ For each transcript element, an associated transcript file is also created in course OLX.
Arguments:
video_id (str): Video id of the video.
@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
if video_transcripts.exists():
transcripts_el = SubElement(video_el, 'transcripts')
- exported_language_codes = []
+ transcript_files_map = {}
for video_transcript in video_transcripts:
- if video_transcript.language_code not in exported_language_codes:
- language_code = video_transcript.language_code
- file_format = video_transcript.file_format
+ language_code = video_transcript.language_code
+ file_format = video_transcript.file_format
- create_transcript_file(
- video_id,
- language_code,
- file_format,
- resource_fs.delegate_fs(),
- combine(u'course', static_dir) # File system should not start from /draft directory.
+ try:
+ transcript_filename = create_transcript_file(
+ video_id=video_id,
+ language_code=language_code,
+ file_format=file_format,
+ resource_fs=resource_fs.delegate_fs(),
+ static_dir=combine(u'course', static_dir) # File system should not start from /draft directory.
)
+ transcript_files_map[language_code] = transcript_filename
+ except TranscriptsGenerationException:
+ # we don't want to halt export in this case, just log and move to the next transcript.
+ logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
+ continue
- SubElement(
- transcripts_el,
- 'transcript',
- {
- 'language_code': language_code,
- 'file_format': file_format,
- 'provider': video_transcript.provider,
- }
- )
- exported_language_codes.append(video_transcript.language_code)
+ SubElement(
+ transcripts_el,
+ 'transcript',
+ {
+ 'language_code': language_code,
+ 'file_format': Transcript.SRT,
+ 'provider': video_transcript.provider,
+ }
+ )
- return video_el
+ return dict(xml=video_el, transcripts=transcript_files_map)
def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
)
return
-
# Get file format from transcript content.
try:
file_format = get_transcript_format(file_content)
diff --git a/edxval/exceptions.py b/edxval/exceptions.py
index 4ae9e98a..13446001 100644
--- a/edxval/exceptions.py
+++ b/edxval/exceptions.py
@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
This error is raised when an transcript provider is not supported
"""
pass
+
+
+class TranscriptsGenerationException(ValError):
+ """
+ This error is raised when a transcript content is not parse-able in specified format.
+ """
+ pass
diff --git a/edxval/tests/test_api.py b/edxval/tests/test_api.py
index 5f54151a..690dd17d 100644
--- a/edxval/tests/test_api.py
+++ b/edxval/tests/test_api.py
@@ -35,6 +35,7 @@
VideoTranscript)
from edxval.serializers import VideoSerializer
from edxval.tests import APIAuthTestCase, constants
+from edxval.transcript_utils import Transcript
def omit_attrs(dict, attrs_to_omit=[]):
@@ -973,11 +974,14 @@ def test_no_encodings(self):
expected = self.parse_xml("""
""")
- self.assert_xml_equal(
- api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR),
- expected
+ exported_metadata = api.export_to_xml(
+ resource_fs=self.file_system,
+ static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
+ video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
)
+ self.assert_xml_equal(exported_metadata['xml'], expected)
+
def test_no_video_transcript(self):
"""
Verify that transcript export for video with no transcript is working as expected.
@@ -986,11 +990,12 @@ def test_no_video_transcript(self):
""")
- exported_xml = api.export_to_xml(
+ exported_metadata = api.export_to_xml(
constants.VIDEO_DICT_STAR['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR
)
+ exported_xml = exported_metadata['xml']
self.assert_xml_equal(exported_xml, expected)
# Verify that no transcript is present in the XML.
@@ -1011,29 +1016,29 @@ def test_basic(self, course_id, image):
-
+
""".format(image=image))
- self.assert_xml_equal(
- api.export_to_xml(
- constants.VIDEO_DICT_FISH['edx_video_id'],
- self.file_system,
- constants.EXPORT_IMPORT_STATIC_DIR,
- course_id
- ),
- expected
+ exported_metadata = api.export_to_xml(
+ constants.VIDEO_DICT_FISH['edx_video_id'],
+ self.file_system,
+ constants.EXPORT_IMPORT_STATIC_DIR,
+ course_id
)
+ self.assert_xml_equal(exported_metadata['xml'], expected)
+ self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])
+
def test_transcript_export(self):
"""
Test that transcript are exported correctly.
"""
language_code = 'en'
video_id = constants.VIDEO_DICT_FISH['edx_video_id']
- transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
+ transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
expected_transcript_path = combine(
self.temp_dir,
combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
@@ -1045,16 +1050,21 @@ def test_transcript_export(self):
-
+
""")
- exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course')
+ exported_metadata = api.export_to_xml(
+ video_id=video_id,
+ course_id='test-course',
+ resource_fs=self.file_system,
+ static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+ )
# Assert video and transcript xml is exported correctly.
- self.assert_xml_equal(exported_xml, expected_xml)
+ self.assert_xml_equal(exported_metadata['xml'], expected_xml)
# Verify transcript file is created.
self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
@@ -1065,7 +1075,13 @@ def test_transcript_export(self):
open(combine(expected_transcript_path, transcript_files[language_code]))
).read()
transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
- self.assertEqual(transcript['content'], expected_transcript_content)
+ transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
+ exported_transcript_content = Transcript.convert(
+ transcript['content'],
+ input_format=transcript_format,
+ output_format=Transcript.SRT,
+ ).encode('utf-8')
+ self.assertEqual(exported_transcript_content, expected_transcript_content)
def test_unknown_video(self):
@@ -1728,20 +1744,18 @@ def test_import_transcript_from_fs_bad_content(self, mock_logger):
"""
language_code = 'en'
edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
- # First create transcript file.
+
+ # First create non utf-8 encoded transcript file in the file system.
transcript_file_name = 'invalid-transcript.txt'
invalid_transcript = dict(
constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
video_id=edx_video_id,
- file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
- )
- utils.create_file_in_fs(
- invalid_transcript['file_data'],
- transcript_file_name,
- self.file_system,
- constants.EXPORT_IMPORT_STATIC_DIR
+ file_data=u'Привіт, edX вітає вас.'
)
+ with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
+ f.write(invalid_transcript['file_data'].encode('cp1251'))
+
api.import_transcript_from_fs(
edx_video_id=edx_video_id,
language_code=language_code,
diff --git a/edxval/tests/test_transcript_utils.py b/edxval/tests/test_transcript_utils.py
new file mode 100644
index 00000000..cd14eb79
--- /dev/null
+++ b/edxval/tests/test_transcript_utils.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for transcript utils.
+"""
+import ddt
+import json
+import textwrap
+import unittest
+
+from edxval.transcript_utils import Transcript
+from edxval.exceptions import TranscriptsGenerationException
+
+
+@ddt.ddt
+class TestTranscriptUtils(unittest.TestCase):
+ """
+ Tests transcripts conversion util.
+ """
+ def setUp(self):
+ super(TestTranscriptUtils, self).setUp()
+
+ self.srt_transcript = textwrap.dedent("""\
+ 0
+ 00:00:10,500 --> 00:00:13,000
+ Elephant's Dream 大象的梦想
+
+ 1
+ 00:00:15,000 --> 00:00:18,000
+ At the left we can see...
+
+ """)
+
+ self.sjson_transcript = textwrap.dedent("""\
+ {
+ "start": [
+ 10500,
+ 15000
+ ],
+ "end": [
+ 13000,
+ 18000
+ ],
+ "text": [
+ "Elephant's Dream 大象的梦想",
+ "At the left we can see..."
+ ]
+ }
+ """)
+
+ @ddt.data(
+ ('invalid_input_format', 'sjson'),
+ ('sjson', 'invalid_output_format'),
+ ('invalid_input_format', 'invalid_output_format')
+ )
+ @ddt.unpack
+ def test_invalid_transcript_format(self, input_format, output_format):
+ """
+ Tests that transcript conversion raises `AssertionError` on invalid input/output formats.
+ """
+ with self.assertRaises(AssertionError):
+ Transcript.convert(self.sjson_transcript, input_format, output_format)
+
+ def test_convert_srt_to_srt(self):
+ """
+ Tests that srt to srt conversion works as expected.
+ """
+ expected = self.srt_transcript.decode('utf-8')
+ actual = Transcript.convert(self.srt_transcript, 'srt', 'srt')
+ self.assertEqual(actual, expected)
+
+ def test_convert_sjson_to_srt(self):
+ """
+ Tests that the sjson transcript is successfully converted into srt format.
+ """
+ expected = self.srt_transcript.decode('utf-8')
+ actual = Transcript.convert(self.sjson_transcript, 'sjson', 'srt')
+ self.assertEqual(actual, expected)
+
+ def test_convert_srt_to_sjson(self):
+ """
+ Tests that the srt transcript is successfully converted into sjson format.
+ """
+ expected = self.sjson_transcript.decode('utf-8')
+ actual = Transcript.convert(self.srt_transcript, 'srt', 'sjson')
+ self.assertDictEqual(json.loads(actual), json.loads(expected))
+
+ def test_convert_invalid_srt_to_sjson(self):
+ """
+ Tests that TranscriptsGenerationException was raises on trying
+ to convert invalid srt transcript to sjson.
+ """
+ invalid_srt_transcript = 'invalid SubRip file content'
+ with self.assertRaises(TranscriptsGenerationException):
+ Transcript.convert(invalid_srt_transcript, 'srt', 'sjson')
diff --git a/edxval/transcript_utils.py b/edxval/transcript_utils.py
new file mode 100644
index 00000000..058158e7
--- /dev/null
+++ b/edxval/transcript_utils.py
@@ -0,0 +1,117 @@
+"""
+A module containing transcripts utils.
+"""
+import json
+from six import text_type
+
+from pysrt import SubRipFile, SubRipItem, SubRipTime
+from pysrt.srtexc import Error
+
+from edxval.exceptions import TranscriptsGenerationException
+
+
+class Transcript(object):
+ """
+ Container for transcript methods.
+ """
+ SRT = 'srt'
+ SJSON = 'sjson'
+
+ @staticmethod
+ def generate_sjson_from_srt(srt_subs):
+ """
+ Generate transcripts from sjson to SubRip (*.srt).
+
+ Arguments:
+ srt_subs(SubRip): "SRT" subs object
+
+ Returns:
+ Subs converted to "SJSON" format.
+ """
+ sub_starts = []
+ sub_ends = []
+ sub_texts = []
+ for sub in srt_subs:
+ sub_starts.append(sub.start.ordinal)
+ sub_ends.append(sub.end.ordinal)
+ sub_texts.append(sub.text.replace('\n', ' '))
+
+ sjson_subs = {
+ 'start': sub_starts,
+ 'end': sub_ends,
+ 'text': sub_texts
+ }
+ return sjson_subs
+
+ @staticmethod
+ def generate_srt_from_sjson(sjson_subs):
+ """
+ Generate transcripts from sjson to SubRip (*.srt).
+
+ Arguments:
+ sjson_subs (dict): `sjson` subs.
+
+ Returns:
+ Subtitles in SRT format.
+ """
+
+ output = ''
+
+ equal_len = len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text'])
+ if not equal_len:
+ return output
+
+ for i in range(len(sjson_subs['start'])):
+ item = SubRipItem(
+ index=i,
+ start=SubRipTime(milliseconds=sjson_subs['start'][i]),
+ end=SubRipTime(milliseconds=sjson_subs['end'][i]),
+ text=sjson_subs['text'][i]
+ )
+ output += (unicode(item))
+ output += '\n'
+ return output
+
+ @classmethod
+ def convert(cls, content, input_format, output_format):
+ """
+ Convert transcript `content` from `input_format` to `output_format`.
+
+ Arguments:
+ content: Transcript content byte-stream.
+ input_format: Input transcript format.
+ output_format: Output transcript format.
+
+ Accepted input formats: sjson, srt.
+ Accepted output format: srt, sjson.
+
+ Raises:
+ TranscriptsGenerationException: On parsing the invalid srt
+ content during conversion from srt to sjson.
+ """
+ assert input_format in ('srt', 'sjson')
+ assert output_format in ('srt', 'sjson')
+
+ # Decode the content with utf-8-sig which will also
+ # skip byte order mark(BOM) character if found.
+ content = content.decode('utf-8-sig')
+
+ if input_format == output_format:
+ return content
+
+ if input_format == 'srt':
+
+ if output_format == 'sjson':
+ try:
+ # With error handling (set to 'ERROR_RAISE'), we will be getting
+ # the exception if something went wrong in parsing the transcript.
+ srt_subs = SubRipFile.from_string(content, error_handling=SubRipFile.ERROR_RAISE)
+ except Error as ex: # Base exception from pysrt
+ raise TranscriptsGenerationException(text_type(ex))
+
+ return json.dumps(cls.generate_sjson_from_srt(srt_subs))
+
+ if input_format == 'sjson':
+
+ if output_format == 'srt':
+ return cls.generate_srt_from_sjson(json.loads(content))
diff --git a/edxval/utils.py b/edxval/utils.py
index 795c0344..10c5e56f 100644
--- a/edxval/utils.py
+++ b/edxval/utils.py
@@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir):
Arguments:
file_data (str): Data to store into the file.
file_name (str): File name of the file to be created.
- resource_fs (OSFS): Import file system.
+ file_system (OSFS): Import file system.
static_dir (str): The Directory to retrieve transcript file.
"""
with file_system.open(combine(static_dir, file_name), 'wb') as f:
- f.write(file_data)
+ f.write(file_data.encode('utf-8'))
def get_transcript_format(transcript_content):
diff --git a/setup.py b/setup.py
index 5e019c7f..23a0dc2b 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ def load_requirements(*requirements_paths):
setup(
name='edxval',
- version='0.1.14',
+ version='0.1.15',
author='edX',
url='http://github.com/edx/edx-val',
description='edx-val',