Add README

philipgiuliani · Nov 18, 2023 · 855a16c · 855a16c
1 parent 4e7d28e
commit 855a16c
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -1,15 +1,9 @@
-# glubs
+# glubs - Subtitle parser
 
 [![Package Version](https://img.shields.io/hexpm/v/glubs)](https://hex.pm/packages/glubs)
 [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/glubs/)
 
-## Quick start
-
-```sh
-gleam run   # Run the project
-gleam test  # Run the tests
-gleam shell # Run an Erlang shell
-```
+glubs (gleam subtitles) is a WebVTT parser written in Gleam (SRT support may follow in the future). It parses WebVTT files into a structured representation of their content.
 
 ## Installation
 
@@ -20,3 +14,69 @@ gleam add glubs
 ```
 
 and its documentation can be found at <https://hexdocs.pm/glubs>.
+
+## Features
+
+* [x] Parses WebVTT files into a structured format
+* [x] Handles both comments and cues with start and end times
+* [x] Tokenizes a WebVTT cue payload into individual tokens
+* [ ] Converts a `WebVTT` value back to a string
+* [ ] Converts a list of tokens back to a string
+
+## Example
+
+```gleam
+import glubs/webvtt.{Cue, EndTag, Note, StartTag, Text, WebVTT}
+import gleam/option.{None, Some}
+import simplifile
+
+pub fn main() {
+  // WebVTT parser
+  let assert Ok(content) = simplifile.read("test/fixtures/comments.vtt")
+  let assert Ok(result) = webvtt.parse(content)
+
+  let assert WebVTT(
+    comment: Some("- Translation of that film I like"),
+    items: [
+      Note(
+        "This translation was done by Kyle so that\nsome friends can watch it with their parents.",
+      ),
+      Cue(
+        id: Some("1"),
+        start_time: 135_000,
+        end_time: 140_000,
+        payload: "- Ta en kopp varmt te.\n- Det är inte varmt.",
+      ),
+      Cue(
+        id: Some("2"),
+        start_time: 140_000,
+        end_time: 145_000,
+        payload: "- Har en kopp te.\n- Det smakar som te.",
+      ),
+      Note("This last line may not translate well."),
+      Cue(
+        id: Some("3"),
+        start_time: 145_000,
+        end_time: 150_000,
+        payload: "- Ta en kopp",
+      ),
+    ],
+  ) = result
+
+  // Cue payload tokenizer
+  let assert Ok(tokens) =
+    "<v Phil>Hi!\n<v.loud.shout Rob>Hello <i>mate!</i></v>"
+    |> webvtt.tokenize()
+
+  let assert [
+    StartTag("v", classes: [], annotation: Some("Phil")),
+    Text("Hi!\n"),
+    StartTag("v", classes: ["loud", "shout"], annotation: Some("Rob")),
+    Text("Hello "),
+    StartTag("i", classes: [], annotation: None),
+    Text("mate!"),
+    EndTag("i"),
+    EndTag("v"),
+  ] = tokens
+}
+```
diff --git a/src/glubs/webvtt.gleam b/src/glubs/webvtt.gleam
@@ -1,28 +1,22 @@
-import gleam/option.{None, Some}
+import gleam/option.{None, Option, Some}
 import gleam/string
 import gleam/result
 import gleam/list
 import gleam/int
 
+/// Item represents an individual item in a WebVTT file, which can be either a Note or a Cue.
 pub type Item {
   Note(String)
-  Cue(
-    id: option.Option(String),
-    start_time: Int,
-    end_time: Int,
-    payload: String,
-  )
+  Cue(id: Option(String), start_time: Int, end_time: Int, payload: String)
 }
 
+/// Represents a WebVTT file with an optional comment and a list of items.
 pub type WebVTT {
-  WebVTT(comment: option.Option(String), items: List(Item))
+  WebVTT(comment: Option(String), items: List(Item))
 }
 
-pub type ParserError {
-  ParserError(String)
-}
-
-pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) {
+/// Parses a WebVTT string and returns a Result containing either the parsed WebVTT structure or an error message describing why parsing failed.
+pub fn parse(webvtt: String) -> Result(WebVTT, String) {
   let [header, ..cues] =
     webvtt
     |> string.replace("\r\n", "\n")
@@ -38,89 +32,84 @@ pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) {
   Ok(WebVTT(comment: comment, items: items))
 }
 
-fn parse_comment(header: String) -> Result(option.Option(String), ParserError) {
+fn parse_comment(header: String) -> Result(Option(String), String) {
   case header {
     "WEBVTT" -> Ok(None)
     "WEBVTT\t" <> comment -> Ok(Some(comment))
     "WEBVTT " <> comment -> Ok(Some(comment))
-    "WEBVTT" <> _other ->
-      Error(ParserError("Header comment must start with space or tab"))
-    _other -> Error(ParserError("Must start with \"WEBVTT\""))
+    "WEBVTT" <> _other -> Error("Header comment must start with space or tab")
+    _other -> Error("Must start with \"WEBVTT\"")
   }
 }
 
-fn parse_item(item: String) -> Result(Item, ParserError) {
+fn parse_item(item: String) -> Result(Item, String) {
   item
   |> parse_note()
   |> result.try_recover(fn(_) { parse_cue(item) })
 }
 
-fn parse_note(note: String) -> Result(Item, ParserError) {
+fn parse_note(note: String) -> Result(Item, String) {
   case note {
     "NOTE\n" <> note -> Ok(Note(note))
     "NOTE " <> note -> Ok(Note(note))
-    _other -> Error(ParserError("Invalid note"))
+    _other -> Error("Invalid note")
   }
 }
 
-fn parse_cue(cue: String) -> Result(Item, ParserError) {
+fn parse_cue(cue: String) -> Result(Item, String) {
   use #(id, rest) <- result.try(parse_cue_id(cue))
 
   case string.split_once(rest, "\n") {
     Ok(#(line, payload)) -> {
       use #(start, end) <- result.try(parse_timestamps(line))
       Ok(Cue(id: id, payload: payload, start_time: start, end_time: end))
     }
-    Error(Nil) -> Error(ParserError("Invalid cue"))
+    Error(Nil) -> Error("Invalid cue")
   }
 }
 
-fn parse_cue_id(
-  cue: String,
-) -> Result(#(option.Option(String), String), ParserError) {
+fn parse_cue_id(cue: String) -> Result(#(Option(String), String), String) {
   case string.split_once(cue, "\n") {
     Ok(#(id, rest)) -> {
       case string.contains(id, "-->") {
         True -> Ok(#(None, cue))
         False -> Ok(#(Some(id), rest))
       }
     }
-    Error(Nil) -> Error(ParserError("Invalid cue"))
+    Error(Nil) -> Error("Invalid cue")
   }
 }
 
-fn parse_timestamps(line: String) -> Result(#(Int, Int), ParserError) {
+fn parse_timestamps(line: String) -> Result(#(Int, Int), String) {
   case string.split(line, " --> ") {
     [start, end] -> {
       use start <- result.try(
         start
         |> parse_timestamp()
-        |> result.replace_error(ParserError("Invalid start timestamp")),
+        |> result.replace_error("Invalid start timestamp"),
       )
 
       use end <- result.try(
         end
         |> parse_timestamp()
-        |> result.replace_error(ParserError("Invalid end timestamp")),
+        |> result.replace_error("Invalid end timestamp"),
       )
 
       Ok(#(start, end))
     }
-    _other -> Error(ParserError("Invalid timestamp"))
+    _other -> Error("Invalid timestamp")
   }
 }
 
+/// Token represents an individual token that can be generated during the tokenization of a WebVTT cue payload.
 pub type Token {
-  StartTag(
-    tag: String,
-    classes: List(String),
-    annotation: option.Option(String),
-  )
+  StartTag(tag: String, classes: List(String), annotation: Option(String))
   Text(content: String)
   Timestamp(ms: Int)
   EndTag(tag: String)
 }
 
+/// TokenizationError represents errors that may occur during the tokenization process.
 pub type TokenizationError {
   InvalidStartToken
   InvalidEndToken
@@ -132,6 +121,7 @@ pub fn tokenize(payload: String) -> Result(List(Token), TokenizationError) {
   |> result.map(list.reverse)
 }
 
+/// Tokenizes the given cue payload and returns a Result containing the list of generated tokens or a tokenization error.
 fn do_tokenize(
   payload: String,
   acc: List(Token),

diff --git a/test/glubs/webvtt_test.gleam b/test/glubs/webvtt_test.gleam
@@ -6,15 +6,13 @@ import simplifile
 pub fn parse_invalid_header_test() {
   "INVALID"
   |> webvtt.parse()
-  |> should.equal(Error(webvtt.ParserError("Must start with \"WEBVTT\"")))
+  |> should.equal(Error("Must start with \"WEBVTT\""))
 }
 
 pub fn parse_attached_header_test() {
   "WEBVTTinvalid"
   |> webvtt.parse()
-  |> should.equal(Error(webvtt.ParserError(
-    "Header comment must start with space or tab",
-  )))
+  |> should.equal(Error("Header comment must start with space or tab"))
 }
 
 pub fn parse_only_header_test() {