Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Offline Speech Recognition #2089 #2242

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2d8bb93
Offline Speech Recognition #2089
VladislavAntonyuk Sep 30, 2024
67f44a3
Merge branch 'main' into 2089-offline-speech-recognition
VladislavAntonyuk Oct 1, 2024
9b7e48d
Offline Speech Recognition #2089 (#2258)
VladislavAntonyuk Oct 5, 2024
07c4ac8
Fix build
VladislavAntonyuk Oct 7, 2024
f3c1497
Merge branch 'main' into 2089-offline-speech-recognition
VladislavAntonyuk Oct 10, 2024
6c52500
Update according to comments
VladislavAntonyuk Oct 14, 2024
27bf6ae
Merge branch 'main' into 2089-offline-speech-recognition
VladislavAntonyuk Oct 14, 2024
e8a28b8
Fix tizen
VladislavAntonyuk Oct 14, 2024
02d322a
Merge branch 'main' into 2089-offline-speech-recognition
VladislavAntonyuk Oct 19, 2024
14facc0
Discard changes to samples/CommunityToolkit.Maui.Sample/CommunityTool…
VladislavAntonyuk Oct 19, 2024
4e8b436
Discard changes to global.json
VladislavAntonyuk Oct 19, 2024
285e477
Merge branch 'main' into 2089-offline-speech-recognition
VladislavAntonyuk Oct 21, 2024
eddfc71
Remove Task
VladislavAntonyuk Oct 25, 2024
79345ae
Merge remote-tracking branch 'origin/main' into 2089-offline-speech-r…
VladislavAntonyuk Oct 25, 2024
3f8b96f
Merge branch '2089-offline-speech-recognition' of https://github.com/…
VladislavAntonyuk Oct 25, 2024
67894fc
Fix tizen
VladislavAntonyuk Oct 25, 2024
b69b054
Update ISpeechToText.shared.cs
VladislavAntonyuk Oct 27, 2024
e995e8a
Update ISpeechToText.shared.cs
VladislavAntonyuk Oct 27, 2024
833c9c7
Update samples/CommunityToolkit.Maui.Sample/ViewModels/Essentials/Off…
VladislavAntonyuk Oct 27, 2024
fd3e1fb
Fix xml comment
VladislavAntonyuk Oct 27, 2024
7ddd7fe
Update sample
VladislavAntonyuk Nov 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@
SelectedItem="{Binding CurrentLocale}"
ItemDisplayBinding="{Binding ., Converter={StaticResource PickerLocaleDisplayConverter}}"/>

<Label
Text="State"
FontAttributes="Bold"/>

<Label
Text="{Binding State}"
FontSize="18"
HorizontalOptions="Center"
HorizontalTextAlignment="Center"
MinimumHeightRequest="100" />

<Label
Text="Language Output"
FontAttributes="Bold"/>
Expand All @@ -53,7 +64,6 @@
StrokeShape="RoundRectangle 8,8,8,8"
Padding="12">
<Border.Content>

<Grid RowDefinitions="*,60"
ColumnDefinitions="*,*"
RowSpacing="12"
Expand All @@ -62,21 +72,21 @@
<Button
Grid.Row="0"
Grid.Column="0"
Text="ListenAsync"
Command="{Binding ListenCommand}"
Text="StartListenAsync"
Command="{Binding StartListenCommand}"
HorizontalOptions="End" />

<Button
Grid.Row="0"
Grid.Column="1"
Text="Cancel Token"
Command="{Binding ListenCancelCommand}"
Text="StopListenAsync"
Command="{Binding StopListenCommand}"
HorizontalOptions="Start" />

<Label
Grid.Row="1"
Grid.ColumnSpan="2"
Text="The `ListenAsync` API allows you to await the final speech recognition results using async/await. `ListenAsync` is cancelled via CancellationToken."
Text="The `StartListenAsync` API starts the speech-to-text service and shares the results using `RecognitionResultUpdated` event and `RecognitionResultCompleted` event."
HorizontalOptions="Center"
HorizontalTextAlignment="Center"
FontSize="12"/>
Expand All @@ -99,21 +109,21 @@
<Button
Grid.Row="0"
Grid.Column="0"
Text="StartListenAsync"
Command="{Binding StartListenCommand}"
Text="StartOfflineListenAsync"
Command="{Binding StartOfflineListenCommand}"
HorizontalOptions="End" />

<Button
Grid.Row="0"
Grid.Column="1"
Text="StopListenAsync"
Command="{Binding StopListenCommand}"
Text="StopOfflineListenAsync"
Command="{Binding StopOfflineListenCommand}"
HorizontalOptions="Start" />

<Label
Grid.Row="1"
Grid.ColumnSpan="2"
Text="The `StartListenAsync` API starts the speech-to-text service and shares the results using `RecognitionResultUpdated` event and `RecognitionResultCompleted` event."
Text="The `StartOfflineListenAsync` API starts the speech-to-text service and shares the results using `RecognitionResultUpdated` event and `RecognitionResultCompleted` event."
HorizontalOptions="Center"
HorizontalTextAlignment="Center"
FontSize="12"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,23 @@ public partial class SpeechToTextViewModel : BaseViewModel
[ObservableProperty]
Locale? currentLocale;

public SpeechToTextState? State => speechToText.CurrentState;

[ObservableProperty]
string? recognitionText = "Welcome to .NET MAUI Community Toolkit!";

[ObservableProperty, NotifyCanExecuteChangedFor(nameof(ListenCommand))]
bool canListenExecute = true;


[ObservableProperty, NotifyCanExecuteChangedFor(nameof(StartListenCommand))]
bool canStartListenExecute = true;

[ObservableProperty, NotifyCanExecuteChangedFor(nameof(StopListenCommand))]
bool canStopListenExecute = false;

[ObservableProperty, NotifyCanExecuteChangedFor(nameof(StartOfflineListenCommand))]
bool canStartOfflineListenExecute = true;

[ObservableProperty, NotifyCanExecuteChangedFor(nameof(StopOfflineListenCommand))]
bool canStopOfflineListenExecute = false;

public SpeechToTextViewModel(ITextToSpeech textToSpeech, ISpeechToText speechToText)
{
this.textToSpeech = textToSpeech;
Expand Down Expand Up @@ -82,94 +87,94 @@ async Task Play(CancellationToken cancellationToken)
}
}

[RelayCommand(IncludeCancelCommand = true, CanExecute = nameof(CanListenExecute))]
async Task Listen(CancellationToken cancellationToken)
[RelayCommand(CanExecute = nameof(CanStartListenExecute))]
async Task StartListen()
{
CanStartListenExecute = false;
CanStartOfflineListenExecute = false;
CanStopOfflineListenExecute = false;
CanStopListenExecute = true;

try
var isGranted = await speechToText.RequestPermissions(CancellationToken.None);
if (!isGranted)
{
var isGranted = await speechToText.RequestPermissions(cancellationToken);
if (!isGranted)
{
await Toast.Make("Permission not granted").Show(cancellationToken);
return;
}

const string beginSpeakingPrompt = "Begin speaking...";
await Toast.Make("Permission not granted").Show(CancellationToken.None);
return;
}

RecognitionText = beginSpeakingPrompt;
if (Connectivity.NetworkAccess != NetworkAccess.Internet)
{
await Toast.Make("Internet connection is required").Show(CancellationToken.None);
return;
}

var recognitionResult = await speechToText.ListenAsync(
CultureInfo.GetCultureInfo(CurrentLocale?.Language ?? defaultLanguage),
new Progress<string>(partialText =>
{
if (RecognitionText is beginSpeakingPrompt)
{
RecognitionText = string.Empty;
}
const string beginSpeakingPrompt = "Begin speaking...";

RecognitionText += partialText + " ";
}), cancellationToken);
RecognitionText = beginSpeakingPrompt;

if (recognitionResult.IsSuccessful)
{
RecognitionText = recognitionResult.Text;
}
else
{
await Toast.Make(recognitionResult.Exception?.Message ?? "Unable to recognize speech").Show(CancellationToken.None);
}
speechToText.RecognitionResultUpdated += HandleRecognitionResultUpdated;

if (RecognitionText is beginSpeakingPrompt)
{
RecognitionText = string.Empty;
}
}
finally
await speechToText.StartListenAsync(CultureInfo.GetCultureInfo(CurrentLocale?.Language ?? defaultLanguage), CancellationToken.None);

if (RecognitionText is beginSpeakingPrompt)
{
CanStartListenExecute = true;
RecognitionText = string.Empty;
}
}

[RelayCommand(CanExecute = nameof(CanStartListenExecute))]
async Task StartListen(CancellationToken cancellationToken)
[RelayCommand(CanExecute = nameof(CanStopListenExecute))]
Task StopListen()
{
CanStartListenExecute = true;
CanStartOfflineListenExecute = true;
CanStopOfflineListenExecute = false;
CanStopListenExecute = false;

speechToText.RecognitionResultUpdated -= HandleRecognitionResultUpdated;

return speechToText.StopListenAsync(CancellationToken.None);
}

[RelayCommand(CanExecute = nameof(CanStartOfflineListenExecute))]
async Task StartOfflineListen()
{
CanListenExecute = false;
CanStartListenExecute = false;
CanStopListenExecute = true;
CanStopListenExecute = false;
CanStartOfflineListenExecute = false;
CanStopOfflineListenExecute = true;

var isGranted = await speechToText.RequestPermissions(cancellationToken);
var isGranted = await speechToText.RequestPermissions(CancellationToken.None);
if (!isGranted)
{
await Toast.Make("Permission not granted").Show(cancellationToken);
await Toast.Make("Permission not granted").Show(CancellationToken.None);
return;
}

const string beginSpeakingPrompt = "Begin speaking...";

RecognitionText = beginSpeakingPrompt;

await speechToText.StartListenAsync(CultureInfo.GetCultureInfo(CurrentLocale?.Language ?? defaultLanguage), cancellationToken);

speechToText.RecognitionResultUpdated += HandleRecognitionResultUpdated;

await speechToText.StartOfflineListenAsync(CultureInfo.GetCultureInfo(CurrentLocale?.Language ?? defaultLanguage), CancellationToken.None);

if (RecognitionText is beginSpeakingPrompt)
{
RecognitionText = string.Empty;
}
}

[RelayCommand(CanExecute = nameof(CanStopListenExecute))]
Task StopListen(CancellationToken cancellationToken)
[RelayCommand(CanExecute = nameof(CanStopOfflineListenExecute))]
Task StopOfflineListen()
{
CanListenExecute = true;
CanStartOfflineListenExecute = true;
CanStartListenExecute = true;
CanStopListenExecute = false;
CanStopOfflineListenExecute = false;

speechToText.RecognitionResultUpdated -= HandleRecognitionResultUpdated;

return speechToText.StopListenAsync(cancellationToken);
return speechToText.StopOfflineListenAsync(CancellationToken.None);
}

void HandleRecognitionResultUpdated(object? sender, SpeechToTextRecognitionResultUpdatedEventArgs e)
Expand All @@ -179,12 +184,12 @@ void HandleRecognitionResultUpdated(object? sender, SpeechToTextRecognitionResul

void HandleRecognitionResultCompleted(object? sender, SpeechToTextRecognitionResultCompletedEventArgs e)
{
RecognitionText = e.RecognitionResult;
RecognitionText = e.RecognitionResult.IsSuccessful ? e.RecognitionResult.Text : e.RecognitionResult.Exception.Message;
}

async void HandleSpeechToTextStateChanged(object? sender, SpeechToTextStateChangedEventArgs e)
void HandleSpeechToTextStateChanged(object? sender, SpeechToTextStateChangedEventArgs e)
{
await Toast.Make($"State Changed: {e.State}").Show(CancellationToken.None);
OnPropertyChanged(nameof(State));
}

void HandleLocalesCollectionChanged(object? sender, NotifyCollectionChangedEventArgs e)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ public class SpeechToTextRecognitionResultCompletedEventArgs : EventArgs
/// <summary>
/// Initializes a new instance of <see cref="SpeechToTextRecognitionResultCompletedEventArgs"/>
/// </summary>
public SpeechToTextRecognitionResultCompletedEventArgs(string recognitionResult)
public SpeechToTextRecognitionResultCompletedEventArgs(SpeechToTextResult recognitionResult)
{
RecognitionResult = recognitionResult;
}

/// <summary>
/// Speech recognition result
/// </summary>
public string RecognitionResult { get; }
public SpeechToTextResult RecognitionResult { get; }
}
bijington marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -28,32 +28,42 @@ public interface ISpeechToText : IAsyncDisposable
SpeechToTextState CurrentState { get; }

/// <summary>
/// Converts speech to text in real time.
/// Starts the SpeechToText service
/// </summary>
/// <remarks>
/// Real time speech recognition results will be surfaced via <see cref="RecognitionResultUpdated"/> and <see cref="RecognitionResultCompleted"/>
/// </remarks>
/// <param name="culture">The spoken language to recognize</param>
/// <param name="recognitionResult">Intermediate recognition result.</param>
/// <param name="cancellationToken"><see cref="CancellationToken"/></param>
bijington marked this conversation as resolved.
Show resolved Hide resolved
VladislavAntonyuk marked this conversation as resolved.
Show resolved Hide resolved
/// <returns>Final recognition result</returns>
Task<SpeechToTextResult> ListenAsync(CultureInfo culture, IProgress<string>? recognitionResult, CancellationToken cancellationToken = default);
Task StartListenAsync(CultureInfo culture, CancellationToken cancellationToken = default);

/// <summary>
/// Starts the SpeechToText service
/// Stops the SpeechToText service
/// </summary>
/// <remarks>
/// Speech recognition results will be surfaced via <see cref="RecognitionResultCompleted"/>
/// </remarks>
/// <param name="cancellationToken"><see cref="CancellationToken"/></param>
Task StopListenAsync(CancellationToken cancellationToken = default);

/// <summary>
/// Starts the Offline SpeechToText service
/// </summary>
/// <remarks>
/// Real time speech recognition results will be surfaced via <see cref="RecognitionResultUpdated"/> and <see cref="RecognitionResultCompleted"/>
/// </remarks>
/// <param name="culture">The spoken language to recognize</param>
/// <param name="cancellationToken"><see cref="CancellationToken"/></param>
Task StartListenAsync(CultureInfo culture, CancellationToken cancellationToken = default);
Task StartOfflineListenAsync(CultureInfo culture, CancellationToken cancellationToken = default);
VladislavAntonyuk marked this conversation as resolved.
Show resolved Hide resolved

/// <summary>
/// Stops the SpeechToText service
/// Stops the Offline SpeechToText service
/// </summary>
/// <remarks>
/// Speech recognition results will be surfaced via <see cref="RecognitionResultCompleted"/>
/// </remarks>
/// <param name="cancellationToken"><see cref="CancellationToken"/></param>
Task StopListenAsync(CancellationToken cancellationToken = default);
Task StopOfflineListenAsync(CancellationToken cancellationToken = default);

/// <summary>
/// Request permissions for speech to text.
Expand Down
Loading