diff --git a/README.md b/README.md index 5c8a542..cbfef3f 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,9 @@ -# glubs +# glubs - Subtitle parser [![Package Version](https://img.shields.io/hexpm/v/glubs)](https://hex.pm/packages/glubs) [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/glubs/) -## Quick start - -```sh -gleam run # Run the project -gleam test # Run the tests -gleam shell # Run an Erlang shell -``` +glubs (gleam subtitles) is a WebVTT (and in the future maybe SRT) parser written in Gleam, designed to parse WebVTT files and provide a structured representation of the content. ## Installation @@ -20,3 +14,69 @@ gleam add glubs ``` and its documentation can be found at <https://hexdocs.pm/glubs>. + +## Features + +* [x] Parses WebVTT files into a structured format +* [x] Handles both comments and cues with start and end times +* [x] Tokenizes WebVTT cue payload into individual tokens +* [ ] Converts a WebVTT type back to a string +* [ ] Converts a list of tokens type back to a string + +## Example + +```gleam +import glubs/webvtt.{Cue, EndTag, Note, StartTag, Text, WebVTT} +import gleam/option.{None, Some} +import simplifile + +pub fn main() { + // WebVTT parser + let assert Ok(content) = simplifile.read("test/fixtures/comments.vtt") + let assert Ok(result) = webvtt.parse(content) + + let assert WebVTT( + comment: Some("- Translation of that film I like"), + items: [ + Note( + "This translation was done by Kyle so that\nsome friends can watch it with their parents.", + ), + Cue( + id: Some("1"), + start_time: 135_000, + end_time: 140_000, + payload: "- Ta en kopp varmt te.\n- Det är inte varmt.", + ), + Cue( + id: Some("2"), + start_time: 140_000, + end_time: 145_000, + payload: "- Har en kopp te.\n- Det smakar som te.", + ), + Note("This last line may not translate well."), + Cue( + id: Some("3"), + start_time: 145_000, + end_time: 150_000, + payload: "- Ta en kopp", + ), + ], + ) = result + + // Cue payload tokenizer + let assert Ok(tokens) = + 
"<v Phil>Hi!\n<v.loud.shout Rob>Hello <i>mate!</i></v>" + |> webvtt.tokenize() + + let assert [ + StartTag("v", classes: [], annotation: Some("Phil")), + Text("Hi!\n"), + StartTag("v", classes: ["loud", "shout"], annotation: Some("Rob")), + Text("Hello "), + StartTag("i", classes: [], annotation: None), + Text("mate!"), + EndTag("i"), + EndTag("v"), + ] = tokens +} +``` diff --git a/src/glubs/webvtt.gleam b/src/glubs/webvtt.gleam index 70072b0..5b126ef 100644 --- a/src/glubs/webvtt.gleam +++ b/src/glubs/webvtt.gleam @@ -1,28 +1,22 @@ -import gleam/option.{None, Some} +import gleam/option.{None, Option, Some} import gleam/string import gleam/result import gleam/list import gleam/int +/// Item represents an individual item in a WebVTT file, which can be either a Note or a Cue. pub type Item { Note(String) - Cue( - id: option.Option(String), - start_time: Int, - end_time: Int, - payload: String, - ) + Cue(id: Option(String), start_time: Int, end_time: Int, payload: String) } +/// Represents a WebVTT file with an optional comment and a list of items. pub type WebVTT { - WebVTT(comment: option.Option(String), items: List(Item)) + WebVTT(comment: Option(String), items: List(Item)) } -pub type ParserError { - ParserError(String) -} - -pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) { +/// Parses a WebVTT string and returns a Result containing the parsed WebVTT structure or a parsing error. 
+pub fn parse(webvtt: String) -> Result(WebVTT, String) { let [header, ..cues] = webvtt |> string.replace("\r\n", "\n") @@ -38,32 +32,31 @@ pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) { Ok(WebVTT(comment: comment, items: items)) } -fn parse_comment(header: String) -> Result(option.Option(String), ParserError) { +fn parse_comment(header: String) -> Result(Option(String), String) { case header { "WEBVTT" -> Ok(None) "WEBVTT\t" <> comment -> Ok(Some(comment)) "WEBVTT " <> comment -> Ok(Some(comment)) - "WEBVTT" <> _other -> - Error(ParserError("Header comment must start with space or tab")) - _other -> Error(ParserError("Must start with \"WEBVTT\"")) + "WEBVTT" <> _other -> Error("Header comment must start with space or tab") + _other -> Error("Must start with \"WEBVTT\"") } } -fn parse_item(item: String) -> Result(Item, ParserError) { +fn parse_item(item: String) -> Result(Item, String) { item |> parse_note() |> result.try_recover(fn(_) { parse_cue(item) }) } -fn parse_note(note: String) -> Result(Item, ParserError) { +fn parse_note(note: String) -> Result(Item, String) { case note { "NOTE\n" <> note -> Ok(Note(note)) "NOTE " <> note -> Ok(Note(note)) - _other -> Error(ParserError("Invalid note")) + _other -> Error("Invalid note") } } -fn parse_cue(cue: String) -> Result(Item, ParserError) { +fn parse_cue(cue: String) -> Result(Item, String) { use #(id, rest) <- result.try(parse_cue_id(cue)) case string.split_once(rest, "\n") { @@ -71,13 +64,11 @@ fn parse_cue(cue: String) -> Result(Item, ParserError) { use #(start, end) <- result.try(parse_timestamps(line)) Ok(Cue(id: id, payload: payload, start_time: start, end_time: end)) } - Error(Nil) -> Error(ParserError("Invalid cue")) + Error(Nil) -> Error("Invalid cue") } } -fn parse_cue_id( - cue: String, -) -> Result(#(option.Option(String), String), ParserError) { +fn parse_cue_id(cue: String) -> Result(#(Option(String), String), String) { case string.split_once(cue, "\n") { Ok(#(id, rest)) -> { case 
string.contains(id, "-->") { @@ -85,42 +76,40 @@ fn parse_cue_id( False -> Ok(#(Some(id), rest)) } } - Error(Nil) -> Error(ParserError("Invalid cue")) + Error(Nil) -> Error("Invalid cue") } } -fn parse_timestamps(line: String) -> Result(#(Int, Int), ParserError) { +fn parse_timestamps(line: String) -> Result(#(Int, Int), String) { case string.split(line, " --> ") { [start, end] -> { use start <- result.try( start |> parse_timestamp() - |> result.replace_error(ParserError("Invalid start timestamp")), + |> result.replace_error("Invalid start timestamp"), ) use end <- result.try( end |> parse_timestamp() - |> result.replace_error(ParserError("Invalid end timestamp")), + |> result.replace_error("Invalid end timestamp"), ) Ok(#(start, end)) } - _other -> Error(ParserError("Invalid timestamp")) + _other -> Error("Invalid timestamp") } } +/// Token represents individual tokens that can be generated during the tokenization of WebVTT cue payload. pub type Token { - StartTag( - tag: String, - classes: List(String), - annotation: option.Option(String), - ) + StartTag(tag: String, classes: List(String), annotation: Option(String)) Text(content: String) Timestamp(ms: Int) EndTag(tag: String) } +/// TokenizationError represents errors that may occur during the tokenization process. pub type TokenizationError { InvalidStartToken InvalidEndToken @@ -132,6 +121,7 @@ pub fn tokenize(payload: String) -> Result(List(Token), TokenizationError) { |> result.map(list.reverse) } +/// Tokenizes the given cue payload and returns a Result containing the list of generated tokens or a tokenization error. 
fn do_tokenize( payload: String, acc: List(Token), diff --git a/test/glubs/webvtt_test.gleam b/test/glubs/webvtt_test.gleam index 923f29a..6004f29 100644 --- a/test/glubs/webvtt_test.gleam +++ b/test/glubs/webvtt_test.gleam @@ -6,15 +6,13 @@ import simplifile pub fn parse_invalid_header_test() { "INVALID" |> webvtt.parse() - |> should.equal(Error(webvtt.ParserError("Must start with \"WEBVTT\""))) + |> should.equal(Error("Must start with \"WEBVTT\"")) } pub fn parse_attached_header_test() { "WEBVTTinvalid" |> webvtt.parse() - |> should.equal(Error(webvtt.ParserError( - "Header comment must start with space or tab", - ))) + |> should.equal(Error("Header comment must start with space or tab")) } pub fn parse_only_header_test() {