Skip to content

Commit

Permalink
Initial WebVTT parser implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
philipgiuliani committed Nov 17, 2023
1 parent b2a9624 commit ceff768
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 1 deletion.
1 change: 1 addition & 0 deletions .tool-versions
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gleam 0.32.4
94 changes: 94 additions & 0 deletions src/glubs/webvtt.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,100 @@ import gleam/result
import gleam/list
import gleam/int

/// A single WebVTT cue: an optional identifier, its timing and its payload.
pub type Cue {
Cue(
// First line of the cue block when that line does not contain "-->";
// `None` when the block starts directly with the timing line.
id: option.Option(String),
// Start and end of the cue, parsed from the "start --> end" timing line.
// NOTE(review): the tests parse "00:00.123" as 123, so these look like
// milliseconds — confirm against `parse_timestamp`.
start_time: Int,
end_time: Int,
// Everything that follows the timing line, kept verbatim.
payload: String,
)
}

/// A parsed WebVTT document.
pub type WebVTT {
// `comment` is the text that follows "WEBVTT" plus a space or tab on the
// header line (`None` when the header is exactly "WEBVTT").
// `cues` holds the parsed cue blocks in the order they appear in the input.
WebVTT(comment: option.Option(String), cues: List(Cue))
}

/// Error returned by the parser. It wraps a human-readable message that
/// describes which part of the input was malformed (e.g. "Invalid cue").
pub type ParserError {
ParserError(String)
}

/// Parses the text of a WebVTT file into a `WebVTT` value.
///
/// The input is split into blocks separated by blank lines. The first line of
/// the first block must be the `WEBVTT` header (optionally followed by a space
/// or tab and a comment); every remaining non-blank block is parsed as a cue.
///
/// All three WebVTT line terminators (`\r\n`, `\n` and a bare `\r`) are
/// accepted. Blocks that contain only whitespace — produced by a trailing
/// blank line or by several consecutive blank lines — are skipped instead of
/// being reported as invalid cues.
///
/// Returns `Error(ParserError(..))` when the header or any cue is malformed.
/// The header is validated before the cues, so a header error takes
/// precedence over cue errors.
pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) {
  let blocks =
    webvtt
    // Normalise CRLF first so that the bare-CR pass below does not turn a
    // single "\r\n" into two line breaks.
    |> string.replace("\r\n", "\n")
    |> string.replace("\r", "\n")
    |> string.split("\n\n")

  // Match exhaustively instead of using a partial `let [..] =` pattern, which
  // newer compilers reject and which would crash on a mismatch.
  case blocks {
    [header_block, ..cue_blocks] -> {
      // TODO: Metadata still needs to be parsed
      // Only the first line of the header block is inspected for now.
      let header = case string.split_once(header_block, "\n") {
        Ok(#(first_line, _metadata)) -> first_line
        Error(Nil) -> header_block
      }

      use comment <- result.try(parse_comment(header))
      use cues <- result.try(
        cue_blocks
        |> list.filter(fn(block) { string.trim(block) != "" })
        |> list.try_map(parse_cue),
      )

      Ok(WebVTT(comment: comment, cues: cues))
    }
    // `string.split` is not expected to return an empty list; this branch
    // exists for exhaustiveness and reports the same error as a missing header.
    [] -> Error(ParserError("Must start with \"WEBVTT\""))
  }
}

/// Presumably meant to serialize a `WebVTT` value back into its textual form.
///
/// Not implemented yet: the body is `todo`, so calling this function crashes
/// at runtime. The argument is currently unused (hence the `_` prefix).
pub fn to_string(_webvtt: WebVTT) -> String {
todo
}

/// Validates the header line and extracts the optional comment after it.
///
/// - exactly `WEBVTT` yields `Ok(None)`
/// - `WEBVTT` followed by a tab or a space yields `Ok(Some(rest_of_line))`
/// - `WEBVTT` followed directly by anything else is an error
/// - any line not starting with `WEBVTT` is an error
fn parse_comment(header: String) -> Result(option.Option(String), ParserError) {
  case header {
    "WEBVTT" -> Ok(None)

    // A single tab or space separates the magic string from the comment.
    "WEBVTT\t" <> text | "WEBVTT " <> text -> Ok(Some(text))

    "WEBVTT" <> _ ->
      Error(ParserError("Header comment must start with space or tab"))

    _ -> Error(ParserError("Must start with \"WEBVTT\""))
  }
}

/// Parses one cue block: an optional identifier line, the timing line and the
/// payload that follows it.
///
/// Fails with "Invalid cue" when no line break follows the timing line, and
/// propagates any error from `parse_cue_id` or `parse_timestamps`.
fn parse_cue(cue: String) -> Result(Cue, ParserError) {
  use #(id, body) <- result.try(parse_cue_id(cue))

  // The first line of the remaining text is the timing line; the rest is the
  // payload, kept verbatim.
  use #(timing_line, payload) <- result.try(
    body
    |> string.split_once("\n")
    |> result.replace_error(ParserError("Invalid cue")),
  )

  use #(start_time, end_time) <- result.try(parse_timestamps(timing_line))

  Ok(Cue(id: id, start_time: start_time, end_time: end_time, payload: payload))
}

/// Splits an optional identifier off the front of a cue block.
///
/// Returns a tuple of the identifier and the text still to be parsed:
/// - when the first line contains `-->` it is the timing line, so there is no
///   identifier and the whole block is returned untouched;
/// - otherwise the first line is the identifier and the rest of the block is
///   returned.
///
/// Fails with "Invalid cue" when the block contains no line break at all.
fn parse_cue_id(
  cue: String,
) -> Result(#(option.Option(String), String), ParserError) {
  cue
  |> string.split_once("\n")
  |> result.replace_error(ParserError("Invalid cue"))
  |> result.map(fn(parts) {
    let #(first_line, remainder) = parts
    case string.contains(first_line, "-->") {
      True -> #(None, cue)
      False -> #(Some(first_line), remainder)
    }
  })
}

/// Parses a cue timing line of the form `<start> --> <end>`.
///
/// The line must contain the separator " --> " exactly once; otherwise
/// "Invalid timestamp" is returned. Each side is handed to `parse_timestamp`,
/// and a failure there is reported as "Invalid start timestamp" or
/// "Invalid end timestamp" respectively. The end timestamp is only parsed
/// once the start timestamp has been parsed successfully.
fn parse_timestamps(line: String) -> Result(#(Int, Int), ParserError) {
  case string.split(line, " --> ") {
    [raw_start, raw_end] ->
      raw_start
      |> parse_timestamp
      |> result.replace_error(ParserError("Invalid start timestamp"))
      |> result.try(fn(start) {
        raw_end
        |> parse_timestamp
        |> result.replace_error(ParserError("Invalid end timestamp"))
        |> result.map(fn(end) { #(start, end) })
      })

    _ -> Error(ParserError("Invalid timestamp"))
  }
}

pub type Token {
StartTag(
tag: String,
Expand Down
37 changes: 36 additions & 1 deletion test/glubs/webvtt_test.gleam
Original file line number Diff line number Diff line change
@@ -1,6 +1,41 @@
import gleeunit/should
import gleam/option.{None, Some}
import glubs/webvtt.{EndTag, StartTag, Text, Timestamp}
import glubs/webvtt.{Cue, EndTag, StartTag, Text, Timestamp, WebVTT}

// Input that does not begin with the "WEBVTT" magic string is rejected.
pub fn parse_invalid_header_test() {
  let expected = Error(webvtt.ParserError("Must start with \"WEBVTT\""))

  webvtt.parse("INVALID")
  |> should.equal(expected)
}

// Text glued directly onto "WEBVTT" (no space or tab in between) is rejected.
pub fn parse_attached_header_test() {
  let expected =
    Error(webvtt.ParserError("Header comment must start with space or tab"))

  webvtt.parse("WEBVTTinvalid")
  |> should.equal(expected)
}

// A file consisting solely of the header parses to a document without a
// comment and without cues. The exact value is asserted (rather than just
// `be_ok`) so that a regression in the parsed result is caught, in line with
// the other parser tests.
pub fn parse_only_header_test() {
  "WEBVTT"
  |> webvtt.parse()
  |> should.equal(Ok(WebVTT(comment: None, cues: [])))
}

// Text after "WEBVTT " on the header line is exposed as the document comment.
pub fn parse_header_with_comment_test() {
  let expected = Ok(WebVTT(comment: Some("This is a comment"), cues: []))

  webvtt.parse("WEBVTT This is a comment")
  |> should.equal(expected)
}

// A cue with an identifier, a timing line and a one-line payload is parsed
// into a single `Cue`.
pub fn parse_cue_test() {
  let source = "WEBVTT\n\n1\n00:00.123 --> 00:00.456\nTest"
  let expected_cue =
    Cue(id: Some("1"), start_time: 123, end_time: 456, payload: "Test")

  source
  |> webvtt.parse
  |> should.equal(Ok(WebVTT(comment: None, cues: [expected_cue])))
}

pub fn tokenize_text_test() {
"Hello"
Expand Down

0 comments on commit ceff768

Please sign in to comment.