From ceff768e4510f37485e8cde46e918caa5fc3bb67 Mon Sep 17 00:00:00 2001
From: Philip Giuliani
Date: Fri, 17 Nov 2023 22:36:24 +0100
Subject: [PATCH] Initial WebVTT parser implementation

---
Review notes (free-form area; not part of the commit message or the diff):
* `parse` normalises CRLF to LF, splits the input on blank lines, and treats
  the first block as the header and every following block as a cue.
* Only the first line of the header block is inspected; any further header
  lines (metadata) are discarded for now, see the TODO in `parse`.
* A cue block that contains no newline (including an empty block) is rejected
  with ParserError("Invalid cue").
* `parse_timestamps` calls `parse_timestamp`, which this patch does not add;
  presumably it already exists in src/glubs/webvtt.gleam -- confirm.
* `to_string` is a `todo` stub and must not be called yet.

 .tool-versions               |  1 +
 src/glubs/webvtt.gleam       | 94 ++++++++++++++++++++++++++++++++++++
 test/glubs/webvtt_test.gleam | 37 +++++++++++++-
 3 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 .tool-versions

diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000..b434626
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1 @@
+gleam 0.32.4
diff --git a/src/glubs/webvtt.gleam b/src/glubs/webvtt.gleam
index 2c61ec6..61d9eef 100644
--- a/src/glubs/webvtt.gleam
+++ b/src/glubs/webvtt.gleam
@@ -4,6 +4,100 @@ import gleam/result
 import gleam/list
 import gleam/int
 
+pub type Cue {
+  Cue(
+    id: option.Option(String),
+    start_time: Int,
+    end_time: Int,
+    payload: String,
+  )
+}
+
+pub type WebVTT {
+  WebVTT(comment: option.Option(String), cues: List(Cue))
+}
+
+pub type ParserError {
+  ParserError(String)
+}
+
+pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) {
+  let [header, ..cues] =
+    webvtt
+    |> string.replace("\r\n", "\n")
+    |> string.split("\n\n")
+
+  // TODO: Metadata still needs to be parsed
+  let [header, ..] = string.split(header, "\n")
+
+  use comment <- result.try(parse_comment(header))
+  use cues <- result.try(list.try_map(cues, parse_cue))
+
+  Ok(WebVTT(comment: comment, cues: cues))
+}
+
+pub fn to_string(_webvtt: WebVTT) -> String {
+  todo
+}
+
+fn parse_comment(header: String) -> Result(option.Option(String), ParserError) {
+  case header {
+    "WEBVTT" -> Ok(None)
+    "WEBVTT\t" <> comment -> Ok(Some(comment))
+    "WEBVTT " <> comment -> Ok(Some(comment))
+    "WEBVTT" <> _other ->
+      Error(ParserError("Header comment must start with space or tab"))
+    _other -> Error(ParserError("Must start with \"WEBVTT\""))
+  }
+}
+
+fn parse_cue(cue: String) -> Result(Cue, ParserError) {
+  use #(id, rest) <- result.try(parse_cue_id(cue))
+
+  case string.split_once(rest, "\n") {
+    Ok(#(line, payload)) -> {
+      use #(start, end) <- result.try(parse_timestamps(line))
+      Ok(Cue(id: id, payload: payload, start_time: start, end_time: end))
+    }
+    Error(Nil) -> Error(ParserError("Invalid cue"))
+  }
+}
+
+fn parse_cue_id(
+  cue: String,
+) -> Result(#(option.Option(String), String), ParserError) {
+  case string.split_once(cue, "\n") {
+    Ok(#(id, rest)) -> {
+      case string.contains(id, "-->") {
+        True -> Ok(#(None, cue))
+        False -> Ok(#(Some(id), rest))
+      }
+    }
+    Error(Nil) -> Error(ParserError("Invalid cue"))
+  }
+}
+
+fn parse_timestamps(line: String) -> Result(#(Int, Int), ParserError) {
+  case string.split(line, " --> ") {
+    [start, end] -> {
+      use start <- result.try(
+        start
+        |> parse_timestamp()
+        |> result.replace_error(ParserError("Invalid start timestamp")),
+      )
+
+      use end <- result.try(
+        end
+        |> parse_timestamp()
+        |> result.replace_error(ParserError("Invalid end timestamp")),
+      )
+
+      Ok(#(start, end))
+    }
+    _other -> Error(ParserError("Invalid timestamp"))
+  }
+}
+
 pub type Token {
   StartTag(
     tag: String,
diff --git a/test/glubs/webvtt_test.gleam b/test/glubs/webvtt_test.gleam
index df404f6..a387302 100644
--- a/test/glubs/webvtt_test.gleam
+++ b/test/glubs/webvtt_test.gleam
@@ -1,6 +1,41 @@
 import gleeunit/should
 import gleam/option.{None, Some}
-import glubs/webvtt.{EndTag, StartTag, Text, Timestamp}
+import glubs/webvtt.{Cue, EndTag, StartTag, Text, Timestamp, WebVTT}
+
+pub fn parse_invalid_header_test() {
+  "INVALID"
+  |> webvtt.parse()
+  |> should.equal(Error(webvtt.ParserError("Must start with \"WEBVTT\"")))
+}
+
+pub fn parse_attached_header_test() {
+  "WEBVTTinvalid"
+  |> webvtt.parse()
+  |> should.equal(Error(webvtt.ParserError(
+    "Header comment must start with space or tab",
+  )))
+}
+
+pub fn parse_only_header_test() {
+  "WEBVTT"
+  |> webvtt.parse()
+  |> should.be_ok()
+}
+
+pub fn parse_header_with_comment_test() {
+  "WEBVTT This is a comment"
+  |> webvtt.parse()
+  |> should.equal(Ok(WebVTT(comment: Some("This is a comment"), cues: [])))
+}
+
+pub fn parse_cue_test() {
+  "WEBVTT\n\n1\n00:00.123 --> 00:00.456\nTest"
+  |> webvtt.parse()
+  |> should.equal(Ok(WebVTT(
+    comment: None,
+    cues: [Cue(id: Some("1"), start_time: 123, end_time: 456, payload: "Test")],
+  )))
+}
 
 pub fn tokenize_text_test() {
   "Hello"