Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
ryochin committed Dec 30, 2023
0 parents commit bc9ddd4
Show file tree
Hide file tree
Showing 25 changed files with 705 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .formatter.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Used by "mix format"
# The input globs cover mix.exs, this .formatter.exs file, and every
# .ex/.exs file under the config/, lib/ and test/ directories.
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
]
66 changes: 66 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
name: Build precompiled NIFs

on:
push:
tags:
- 'v*'

jobs:
build_release:
name: NIF ${{ matrix.nif }} - ${{ matrix.job.target }} (${{ matrix.job.os }})
runs-on: ${{ matrix.job.os }}
strategy:
fail-fast: false
matrix:
nif: ["2.16", "2.15"]
job:
- { target: arm-unknown-linux-gnueabihf , os: ubuntu-20.04 , use-cross: true }
- { target: aarch64-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true }
- { target: aarch64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true }
- { target: aarch64-apple-darwin , os: macos-11 }
- { target: riscv64gc-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true }
- { target: x86_64-apple-darwin , os: macos-11 }
- { target: x86_64-unknown-linux-gnu , os: ubuntu-20.04 }
- { target: x86_64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true }
- { target: x86_64-pc-windows-gnu , os: windows-2019 }
- { target: x86_64-pc-windows-msvc , os: windows-2019 }

steps:
- name: Checkout source code
uses: actions/checkout@v3

- name: Extract project version
shell: bash
run: |
# Get the project version from mix.exs
echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV
- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
with:
toolchain: stable
target: ${{ matrix.job.target }}

- name: Build the project
id: build-crate
uses: philss/rustler-precompiled-action@v1.0.1
with:
project-name: charset_detect
project-version: ${{ env.PROJECT_VERSION }}
target: ${{ matrix.job.target }}
nif-version: ${{ matrix.nif }}
use-cross: ${{ matrix.job.use-cross }}
project-dir: "native/charset_detect"

- name: Artifact upload
uses: actions/upload-artifact@v3
with:
name: ${{ steps.build-crate.outputs.file-name }}
path: ${{ steps.build-crate.outputs.file-path }}

- name: Publish archives and packages
uses: softprops/action-gh-release@v1
with:
files: |
${{ steps.build-crate.outputs.file-path }}
if: startsWith(github.ref, 'refs/tags/')
45 changes: 45 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
### Generated by gibo (https://github.com/simonwhitaker/gibo)
### https://raw.github.com/github/gitignore/218a941be92679ce67d0484547e3e142b2f5f6f0/Global/macOS.gitignore

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Generated by gibo (https://github.com/simonwhitaker/gibo)
### https://raw.github.com/github/gitignore/218a941be92679ce67d0484547e3e142b2f5f6f0/Elixir.gitignore

/_build
/cover
/deps
/doc
/.fetch
erl_crash.dump
*.ez
*.beam
/config/*.secret.exs
.elixir_ls/


/priv/native/*.so
Expand Down
8 changes: 8 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"cSpell.words": [
"chardetng",
"eucjp",
"PRECOMPILATION",
"sjis"
]
}
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Ryo Okamoto

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
105 changes: 105 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
🌏 CharsetDetect: Guess character encoding for Elixir
=====================================================

[![Hex.pm](https://img.shields.io/hexpm/v/charset_detect.svg)](https://hex.pm/packages/charset_detect)
[![Hexdocs.pm](https://img.shields.io/badge/hex-docs-lightgreen.svg)](https://hexdocs.pm/charset_detect/)
[![Hex.pm](https://img.shields.io/hexpm/dt/charset_detect.svg)](https://hex.pm/packages/charset_detect)
[![License](https://img.shields.io/hexpm/l/charset_detect.svg)](https://github.com/ryochin/charset_detect/blob/main/LICENSE)

CharsetDetect is a simple wrapper around the [chardetng](https://crates.io/crates/chardetng) crate.

Usage
-----

Guess the encoding of a string:

```elixir
iex> File.read!("test/assets/sjis.txt") |> CharsetDetect.guess
{:ok, "Shift_JIS"}

iex> File.read!("test/assets/big5.txt") |> CharsetDetect.guess!
"Big5"
```

For long inputs, you can reduce additional memory consumption by passing only the leading part of the text to the detector.

```elixir
"... (long text) ..." |> String.slice(0, 1024) |> CharsetDetect.guess
```

Note that an ASCII string, including an empty string, will result in a `UTF-8` encoding rather than `ASCII`.

```elixir
iex> "hello world" |> CharsetDetect.guess
{:ok, "UTF-8"}
```

Strategies for implementing a conversion function
-------------------------------------------------

You can achieve conversion to any desired encoding using [iconv](https://hex.pm/packages/iconv).

```elixir
defmodule Converter do
def convert(text, to_encoding \\ "UTF-8") do
case text |> CharsetDetect.guess do
{:ok, ^to_encoding} ->
{:ok, text}
{:ok, encoding} ->
try do
{:ok, :iconv.convert(encoding, to_encoding, text)}
rescue
e in ArgumentError -> {:error, inspect(e)}
end
{:error, reason} ->
{:error, reason}
end
end
end
```
```elixir
iex> File.read!("test/assets/big5.txt") |> Converter.convert
{:ok, "大五碼是繁体中文(正體中文)社群最常用的電腦漢字字符集標準。\n"}
```

Installation
------------

The package can be installed by adding `charset_detect` to your list of dependencies in `mix.exs`:

```elixir
def deps do
[
{:charset_detect, "~> 0.1.0"}
]
end
```

Then, run `mix deps.get`.

Development
-----------

### Prerequisites

**Note:** Compiling the native code of this library from source requires the [Rust](https://www.rust-lang.org/) toolchain.

Follow the instructions at [www.rust-lang.org/tools/install](https://www.rust-lang.org/tools/install) to install Rust.

Verify the installation by checking the `cargo` command version:

```sh
cargo --version
# Should output something like: cargo 1.68.1 (115f34552 2023-02-26)
```

Then, set the `RUSTLER_PRECOMPILATION_EXAMPLE_BUILD` environment variable to `1` or `true` so that the native code is compiled from the local sources instead of downloading a precompiled library file.

```sh
RUSTLER_PRECOMPILATION_EXAMPLE_BUILD=1 mix compile
```

License
-------

The MIT License
53 changes: 53 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
version: '3'

vars:
PROJECT_NAME: CharsetDetect

tasks:
compile:
desc: Compile sources (with Rust)
env:
RUSTLER_PRECOMPILATION_EXAMPLE_BUILD: 1
cmds:
- mix compile

compile-using-precompiled-binaries:
desc: Compile sources (without Rust)
cmds:
- mix compile

console:
desc: Open console
env:
RUSTLER_PRECOMPILATION_EXAMPLE_BUILD: 1
cmds:
- cmd: iex -S mix
ignore_error: true

test:
desc: Run tests (with Rust)
env:
RUSTLER_PRECOMPILATION_EXAMPLE_BUILD: 1
cmds:
- mix test --trace

test-using-precompiled-binaries:
desc: Run tests (without Rust)
cmds:
- mix test --trace

clean:
desc: Clean up
cmds:
- mix clean
- rm -f priv/native/*.so

rustler-precompiled-download:
desc: Download precompiled files info
cmds:
- "mix rustler_precompiled.download {{.PROJECT_NAME}} --all --print"

default:
cmds:
- task -l --sort=none
silent: true
51 changes: 51 additions & 0 deletions lib/charset_detect.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
defmodule CharsetDetect do
  @moduledoc """
  CharsetDetect is a simple wrapper around the chardetng crate.

  The detection is delegated to a native function (NIF) that is wired in
  through `RustlerPrecompiled`.
  """

  # Evaluated at compile time: the same project version is used both for the
  # release download URL and for the `:version` option below.
  app_version = Keyword.get(Mix.Project.config(), :version)

  use RustlerPrecompiled,
    otp_app: :charset_detect,
    crate: "charset_detect",
    base_url: "https://github.com/ryochin/charset_detect/releases/download/v#{app_version}",
    # A build from source is forced when the variable is set to "1" or "true".
    force_build: System.get_env("RUSTLER_PRECOMPILATION_EXAMPLE_BUILD") in ["1", "true"],
    version: app_version

  @doc """
  Guesses the character encoding of the given binary.

  Returns `{:ok, encoding}` when the native function yields an encoding name,
  or `{:error, reason}` when it yields an error tuple. Any argument that is
  not a binary results in `{:error, "invalid argument"}`.

  ## Examples

      iex> File.read!("test/assets/sjis.txt") |> CharsetDetect.guess
      {:ok, "Shift_JIS"}

  """
  @spec guess(binary) :: {:ok, String.t()} | {:error, String.t()}
  def guess(body) when is_binary(body) do
    case _guess(body) do
      {:error, reason} -> {:error, reason}
      encoding when is_binary(encoding) -> {:ok, encoding}
    end
  end

  def guess(_), do: {:error, "invalid argument"}

  @doc """
  Guesses the character encoding of the given binary, returning the bare
  encoding name instead of a tagged tuple.

  Raises a `MatchError` when `guess/1` does not return `{:ok, encoding}`.
  Only binaries are accepted; there is no clause for other arguments.

  ## Examples

      iex> File.read!("test/assets/big5.txt") |> CharsetDetect.guess!
      "Big5"

  """
  @spec guess!(binary) :: String.t()
  def guess!(body) when is_binary(body) do
    {:ok, encoding} = guess(body)
    encoding
  end

  # NIF stub: this body only runs when the native implementation has not been
  # loaded, in which case it signals `:nif_not_loaded`.
  @spec _guess(binary) :: String.t() | {:error, String.t()}
  def _guess(_body), do: :erlang.nif_error(:nif_not_loaded)
end
Loading

0 comments on commit bc9ddd4

Please sign in to comment.