Skip to content

Commit

Permalink
Add README
Browse files Browse the repository at this point in the history
  • Loading branch information
philipgiuliani committed Nov 18, 2023
1 parent 4e7d28e commit 855a16c
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 47 deletions.
76 changes: 68 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
# glubs
# glubs - Subtitle parser

[![Package Version](https://img.shields.io/hexpm/v/glubs)](https://hex.pm/packages/glubs)
[![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/glubs/)

## Quick start

```sh
gleam run # Run the project
gleam test # Run the tests
gleam shell # Run an Erlang shell
```
glubs (Gleam subtitles) is a WebVTT parser written in Gleam; SRT support may be added in the future. It parses WebVTT files into a structured representation of their content.

## Installation

Expand All @@ -20,3 +14,69 @@ gleam add glubs
```

and its documentation can be found at <https://hexdocs.pm/glubs>.

## Features

* [x] Parses WebVTT files into a structured format
* [x] Handles both comments and cues with start and end times
* [x] Tokenizes a WebVTT cue payload into individual tokens
* [ ] Converts a `WebVTT` value back to a string
* [ ] Converts a list of tokens back to a string

## Example

```gleam
import glubs/webvtt.{Cue, EndTag, Note, StartTag, Text, WebVTT}
import gleam/option.{None, Some}
import simplifile
pub fn main() {
// WebVTT parser
let assert Ok(content) = simplifile.read("test/fixtures/comments.vtt")
let assert Ok(result) = webvtt.parse(content)
let assert WebVTT(
comment: Some("- Translation of that film I like"),
items: [
Note(
"This translation was done by Kyle so that\nsome friends can watch it with their parents.",
),
Cue(
id: Some("1"),
start_time: 135_000,
end_time: 140_000,
payload: "- Ta en kopp varmt te.\n- Det är inte varmt.",
),
Cue(
id: Some("2"),
start_time: 140_000,
end_time: 145_000,
payload: "- Har en kopp te.\n- Det smakar som te.",
),
Note("This last line may not translate well."),
Cue(
id: Some("3"),
start_time: 145_000,
end_time: 150_000,
payload: "- Ta en kopp",
),
],
) = result
// Cue payload tokenizer
let assert Ok(tokens) =
"<v Phil>Hi!\n<v.loud.shout Rob>Hello <i>mate!</i></v>"
|> webvtt.tokenize()
let assert [
StartTag("v", classes: [], annotation: Some("Phil")),
Text("Hi!\n"),
StartTag("v", classes: ["loud", "shout"], annotation: Some("Rob")),
Text("Hello "),
StartTag("i", classes: [], annotation: None),
Text("mate!"),
EndTag("i"),
EndTag("v"),
] = tokens
}
```
60 changes: 25 additions & 35 deletions src/glubs/webvtt.gleam
Original file line number Diff line number Diff line change
@@ -1,28 +1,22 @@
import gleam/option.{None, Some}
import gleam/option.{None, Option, Some}
import gleam/string
import gleam/result
import gleam/list
import gleam/int

/// Item represents an individual item in a WebVTT file, which can be either a Note or a Cue.
pub type Item {
Note(String)
Cue(
id: option.Option(String),
start_time: Int,
end_time: Int,
payload: String,
)
Cue(id: Option(String), start_time: Int, end_time: Int, payload: String)
}

/// Represents a WebVTT file with an optional comment and a list of items.
pub type WebVTT {
WebVTT(comment: option.Option(String), items: List(Item))
WebVTT(comment: Option(String), items: List(Item))
}

pub type ParserError {
ParserError(String)
}

pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) {
/// Parses a WebVTT string and returns a Result containing either the parsed WebVTT structure or an error message describing why parsing failed.
pub fn parse(webvtt: String) -> Result(WebVTT, String) {
let [header, ..cues] =
webvtt
|> string.replace("\r\n", "\n")
Expand All @@ -38,89 +32,84 @@ pub fn parse(webvtt: String) -> Result(WebVTT, ParserError) {
Ok(WebVTT(comment: comment, items: items))
}

fn parse_comment(header: String) -> Result(option.Option(String), ParserError) {
fn parse_comment(header: String) -> Result(Option(String), String) {
case header {
"WEBVTT" -> Ok(None)
"WEBVTT\t" <> comment -> Ok(Some(comment))
"WEBVTT " <> comment -> Ok(Some(comment))
"WEBVTT" <> _other ->
Error(ParserError("Header comment must start with space or tab"))
_other -> Error(ParserError("Must start with \"WEBVTT\""))
"WEBVTT" <> _other -> Error("Header comment must start with space or tab")
_other -> Error("Must start with \"WEBVTT\"")
}
}

fn parse_item(item: String) -> Result(Item, ParserError) {
fn parse_item(item: String) -> Result(Item, String) {
item
|> parse_note()
|> result.try_recover(fn(_) { parse_cue(item) })
}

fn parse_note(note: String) -> Result(Item, ParserError) {
fn parse_note(note: String) -> Result(Item, String) {
case note {
"NOTE\n" <> note -> Ok(Note(note))
"NOTE " <> note -> Ok(Note(note))
_other -> Error(ParserError("Invalid note"))
_other -> Error("Invalid note")
}
}

fn parse_cue(cue: String) -> Result(Item, ParserError) {
fn parse_cue(cue: String) -> Result(Item, String) {
use #(id, rest) <- result.try(parse_cue_id(cue))

case string.split_once(rest, "\n") {
Ok(#(line, payload)) -> {
use #(start, end) <- result.try(parse_timestamps(line))
Ok(Cue(id: id, payload: payload, start_time: start, end_time: end))
}
Error(Nil) -> Error(ParserError("Invalid cue"))
Error(Nil) -> Error("Invalid cue")
}
}

fn parse_cue_id(
cue: String,
) -> Result(#(option.Option(String), String), ParserError) {
fn parse_cue_id(cue: String) -> Result(#(Option(String), String), String) {
case string.split_once(cue, "\n") {
Ok(#(id, rest)) -> {
case string.contains(id, "-->") {
True -> Ok(#(None, cue))
False -> Ok(#(Some(id), rest))
}
}
Error(Nil) -> Error(ParserError("Invalid cue"))
Error(Nil) -> Error("Invalid cue")
}
}

fn parse_timestamps(line: String) -> Result(#(Int, Int), ParserError) {
fn parse_timestamps(line: String) -> Result(#(Int, Int), String) {
case string.split(line, " --> ") {
[start, end] -> {
use start <- result.try(
start
|> parse_timestamp()
|> result.replace_error(ParserError("Invalid start timestamp")),
|> result.replace_error("Invalid start timestamp"),
)

use end <- result.try(
end
|> parse_timestamp()
|> result.replace_error(ParserError("Invalid end timestamp")),
|> result.replace_error("Invalid end timestamp"),
)

Ok(#(start, end))
}
_other -> Error(ParserError("Invalid timestamp"))
_other -> Error("Invalid timestamp")
}
}

/// Token represents an individual token that can be generated during the tokenization of a WebVTT cue payload.
pub type Token {
StartTag(
tag: String,
classes: List(String),
annotation: option.Option(String),
)
StartTag(tag: String, classes: List(String), annotation: Option(String))
Text(content: String)
Timestamp(ms: Int)
EndTag(tag: String)
}

/// TokenizationError represents errors that may occur during the tokenization process.
pub type TokenizationError {
InvalidStartToken
InvalidEndToken
Expand All @@ -132,6 +121,7 @@ pub fn tokenize(payload: String) -> Result(List(Token), TokenizationError) {
|> result.map(list.reverse)
}

/// Tokenizes the given cue payload and returns a Result containing the list of generated tokens or a tokenization error.
fn do_tokenize(
payload: String,
acc: List(Token),
Expand Down
6 changes: 2 additions & 4 deletions test/glubs/webvtt_test.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@ import simplifile
pub fn parse_invalid_header_test() {
"INVALID"
|> webvtt.parse()
|> should.equal(Error(webvtt.ParserError("Must start with \"WEBVTT\"")))
|> should.equal(Error("Must start with \"WEBVTT\""))
}

pub fn parse_attached_header_test() {
"WEBVTTinvalid"
|> webvtt.parse()
|> should.equal(Error(webvtt.ParserError(
"Header comment must start with space or tab",
)))
|> should.equal(Error("Header comment must start with space or tab"))
}

pub fn parse_only_header_test() {
Expand Down

0 comments on commit 855a16c

Please sign in to comment.