diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
new file mode 100644
index 0000000..4418624
--- /dev/null
+++ b/.github/workflows/CI.yml
@@ -0,0 +1,44 @@
+name: CI
+on:
+  push:
+    branches:
+      - main
+    tags: '*'
+  pull_request:
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+jobs:
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        version:
+          - '1.6'
+          - '1'
+          - 'nightly'
+        os:
+          - ubuntu-latest
+        arch:
+          - x64
+        include:
+          - os: windows-latest
+            julia-version: '1'
+            arch: x64
+          - os: macOS-latest
+            julia-version: '1'
+            arch: x64
+          - os: ubuntu-latest
+            julia-version: '1'
+            arch: x86
+    steps:
+      - uses: actions/checkout@v3
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: julia-actions/cache@v1
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
new file mode 100644
index 0000000..f49313b
--- /dev/null
+++ b/.github/workflows/TagBot.yml
@@ -0,0 +1,15 @@
+name: TagBot
+on:
+  issue_comment:
+    types:
+      - created
+  workflow_dispatch:
+jobs:
+  TagBot:
+    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          ssh: ${{ secrets.DOCUMENTER_KEY }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ba39cc5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Manifest.toml
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e5ff218
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+Copyright (c) 2022 Suzhou-Tongyuan (support@tongyuan.cc)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Project.toml b/Project.toml
new file mode 100644
index 0000000..ebfda3c
--- /dev/null
+++ b/Project.toml
@@ -0,0 +1,13 @@
+name = "UnzipLoops"
+uuid = "65164825-a06a-491c-beb8-9961b1a95625"
+version = "0.1.0"
+
+[compat]
+julia = "1.6"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+
+[targets]
+test = ["Test", "OffsetArrays"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4c5a85e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,114 @@
+# UnzipLoops
+
+This package provides one single function `broadcast_unzip(f, As...)` that works similar to `map(f,
+As...)` but outputs different data layout.
+
+## Example
+
+Broadcasting is very useful in Julia. When you use broadcasting, you'll definitely meet cases like
+this:
+
+```julia
+f(x, y) = x^y, x/y
+
+X, Y = [1, 2, 3, 4], [4, 3, 2, 1]
+out = f.(X, Y)
+```
+
+This `out` is of type `Vector{Tuple{Int,Int}}`. It's often the case that we'll need to unzip it to
+`Tuple{Vector{Int}, Vector{Int}}`. For most of the case, we can trivially split it apart:
+
+```julia
+function g(X, Y)
+    out = f.(X, Y)
+    return getindex.(out, 1), getindex.(out, 2)
+end
+```
+
+In this case, since this requires two more broadcasting, it introduces some unnecessary overhead:
+
+- `f.(X, Y)` allocates one extra memory to store the intermediate result, and
+- the entire function requires two more loops to do the work and thus hurts the cache locality --
+  locality matters for low-level performance optimization.
+
+```julia
+X, Y = rand(1:5, 1024), rand(1:5, 1024)
+@btime f.($X, $Y) # 3.834 μs (1 allocation: 16.12 KiB)
+@btime g($X, $Y) # 5.388 μs (4 allocations: 32.41 KiB)
+```
+
+We can observe 1.5μs overhead here.
+
+A more optimized version for this specific `f` is to pre-allocate the output result, and do
+everything in one single loop. For instance
+
+```julia
+function g(X, Y)
+    @assert size(X) == size(Y)
+    Base.require_one_based_indexing(X, Y)
+
+    T = promote_type(Float64, eltype(X), eltype(Y))
+    N = ndims(X)
+    Z1 = Array{T,N}(undef, size(X))
+    Z2 = Array{T,N}(undef, size(X))
+    @inbounds @simd for i in eachindex(X)
+        v = f(X[i], Y[i])
+        Z1[i] = v[1]
+        Z2[i] = v[2]
+    end
+    return Z1, Z2
+end
+```
+
+We usually call this `g` the SoA (struct of array) layout. Comparing to the AoS (array of struct)
+layout that has only one contiguous array involved, this SoA layout introduces two contiguous
+arrays. Thus you'll still notice some overhead here compared to the plain `f.(X, Y)` kernel:
+
+```julia
+@btime g($X, $Y) # 3.999 μs (2 allocations: 16.25 KiB)
+```
+
+but the overhead is relatively small here -- we eliminate the extra loops and allocations. From case
+to case, Julia compiler can optimze this difference away.
+
+## `broadcast_unzip`
+
+Obviously, rewriting the trivial `getindex` solution into the verbose manual loop introduces many
+work and hurts the readability. This is why `broadcast_unzip` is introduced -- it's a combination of
+broadcasting and unzip. Most importantly, this is simple to use and yet fast:
+
+```julia
+g(X, Y) == broadcast_unzip(f, X, Y) # true
+@btime broadcast_unzip(f, $X, $Y) # 4.009 μs (2 allocations: 16.25 KiB)
+```
+
+Additionally, `broadcast_unzip` accepts more inputs (just like `map`) as long as their sizes match
+and `f` outputs a `Tuple` of a scalar-like object.
+
+```julia
+X, Y, Z = rand(1:5, 1024), rand(1:5, 1024), rand(1:5, 1024)
+f(x, y, z) = x ^ y ^ z, x / y / z, x * y * z, x / (y*z)
+out = broadcast_unzip(f, X, Y, Z)
+@assert out[1] == getindex.(f.(X, Y, Z), 1)
+
+@btime map(f, $X, $Y, $Z) # 13.682 μs (2 allocations: 32.05 KiB)
+@btime broadcast_unzip(f, $X, $Y, $Z) # 13.418 μs (6 allocations: 32.58 KiB)
+```
+
+## Performance caveat -- type stability
+
+Just like the function `map`, the input function `f` has to be type-inferrable to work performantly:
+
+```julia
+X, Y = rand(1:5, 1024), rand(1:5, 1024)
+f_unstable(x, y) = x > 3 ? (x * y, x / y) : (x + y, x - y) # <-- many people make mistakes here
+f_stable(x, y) = x > 3 ? (Float64(x * y), Float64(x / y)) : (Float64(x + y), Float64(x - y))
+
+# unstable
+@btime map(f_unstable, $X, $Y); # 10.619 μs (1026 allocations: 56.25 KiB)
+@btime broadcast_unzip(f_unstable, $X, $Y); # 8.614 μs (428 allocations: 22.91 KiB)
+
+# stable
+@btime map(f_stable, $X, $Y); # 1.687 μs (1 allocation: 16.12 KiB)
+@btime broadcast_unzip(f_stable, $X, $Y); # 3.086 μs (2 allocations: 16.25 KiB)
+```
diff --git a/README_zh.md b/README_zh.md
new file mode 100644
index 0000000..c639e4b
--- /dev/null
+++ b/README_zh.md
@@ -0,0 +1,103 @@
+# UnzipLoops
+
+这个包提供一个函数: `broadcast_unzip(f, As...)`.
+
+## 一个简单的例子
+
+下面是大量使用广播时的一个典型场景
+
+```julia
+f(x, y) = x^y, x/y
+
+X, Y = [1, 2, 3, 4], [4, 3, 2, 1]
+out = f.(X, Y)
+```
+
+这是 `Vector{Tuple{Int, Int}}`， 但经常会遇到希望将它转换为 `Tuple{Vector{Int}, Vector{Int}}` 的需求
+(unzip)。 朴素的做法是手动对其进行拆分：
+
+```julia
+function g(X, Y)
+    out = f.(X, Y)
+    return getindex.(out, 1), getindex.(out, 2)
+end
+```
+
+但由于它引入了两次额外的广播（for循环）， 这会带来不必要的性能损失：
+
+```julia
+X, Y = rand(1:5, 1024), rand(1:5, 1024)
+@btime f.($X, $Y) # 3.834 μs (1 allocation: 16.12 KiB)
+@btime g($X, $Y) # 5.388 μs (4 allocations: 32.41 KiB)
+```
+
+对于 `f` 这个特定函数而言， 更高效的方案则是预先分配内存， 然后在一次循环中处理全部事情：
+
+```julia
+function g(X, Y)
+    @assert size(X) == size(Y)
+    Base.require_one_based_indexing(X, Y)
+
+    T = promote_type(Float64, eltype(X), eltype(Y))
+    N = ndims(X)
+    Z1 = Array{T,N}(undef, size(X))
+    Z2 = Array{T,N}(undef, size(X))
+    @inbounds @simd for i in eachindex(X)
+        v = f(X[i], Y[i])
+        Z1[i] = v[1]
+        Z2[i] = v[2]
+    end
+    return Z1, Z2
+end
+```
+
+虽然相比于 `f` 来说这并不是一个零开销的策略 （这背后涉及到内存布局的差异， 很难达到零开销）， 但它可以显著改善性能：
+
+```julia
+@btime g($X, $Y) # 3.999 μs (2 allocations: 16.25 KiB)
+```
+
+## `broadcast_unzip` 做了什么
+
+很显然， 上面的改写方案虽然有效， 但相比于朴素的方案来说还是显得有些太长了以至于代码的实际细节被掩盖， 影响可读性。
+`broadcast_unzip` 的目的就是为了让这件事变得更简单和高效：
+
+```julia
+g(X, Y) == broadcast_unzip(f, X, Y) # true
+@btime broadcast_unzip(f, $X, $Y) # 4.009 μs (2 allocations: 16.25 KiB)
+```
+
+`broadcast_unzip` 的名字来源于它是两个功能的组合:
+
+- broadcast: `Z = f.(X, Y)`
+- unzip: `getindex.(Z, 1), getindex.(Z, 2)`
+
+`broadcast_unzip` 支持多个输入参数， 但要求每个输入参数的尺寸保持一致， 同时函数 `f` 的输出应该是一
+个 `Tuple` 且每一项是一个标量值。
+
+```julia
+X, Y, Z = rand(1:5, 1024), rand(1:5, 1024), rand(1:5, 1024)
+f(x, y, z) = x ^ y ^ z, x / y / z, x * y * z, x / (y*z)
+out = broadcast_unzip(f, X, Y, Z)
+@assert out[1] == getindex.(f.(X, Y, Z), 1)
+
+@btime map(f, $X, $Y, $Z) # 13.682 μs (2 allocations: 32.05 KiB)
+@btime broadcast_unzip(f, $X, $Y, $Z) # 13.418 μs (6 allocations: 32.58 KiB)
+```
+
+## 性能要点 -- 类型稳定
+
+与 `map` 一样， 输入函数 `f` 必须要类型稳定才能获得稳定的性能：
+
+```julia
+X, Y = rand(1:5, 1024), rand(1:5, 1024)
+f_unstable(x, y) = x > 3 ? (x * y, x / y) : (x + y, x - y)
+f_stable(x, y) = x > 3 ? (Float64(x * y), Float64(x / y)) : (Float64(x + y), Float64(x - y))
+
+# 类型不稳定
+@btime map(f_unstable, $X, $Y); # 10.619 μs (1026 allocations: 56.25 KiB)
+@btime broadcast_unzip(f_unstable, $X, $Y); # 8.614 μs (428 allocations: 22.91 KiB)
+
+# 类型稳定
+@btime map(f_stable, $X, $Y); # 1.687 μs (1 allocation: 16.12 KiB)
+@btime broadcast_unzip(f_stable, $X, $Y); # 3.086 μs (2 allocations: 16.25 KiB)
diff --git a/src/UnzipLoops.jl b/src/UnzipLoops.jl
new file mode 100644
index 0000000..57b5e12
--- /dev/null
+++ b/src/UnzipLoops.jl
@@ -0,0 +1,131 @@
+module UnzipLoops
+
+export broadcast_unzip
+
+struct Unroll{N} end
+@generated function Unroll{N}(f, args...) where {N}
+    code = Expr(:block)
+    for i in 1:N
+        push!(code.args, :(f($i, args...)))
+    end
+    push!(code.args, :nothing)
+    return code
+end
+
+@generated function _teltypes(::Type{Ts}) where {Ts<:Tuple}
+    if Ts == Tuple
+        error("Cannot infer element types of empty tuple")
+    else
+        Tuple{(eltype(t) for t in Ts.parameters)...}
+    end
+end
+
+@generated function _create_vecs_from_record_type(::Type{Ts}, sz) where {Ts}
+    ndim = length(sz.parameters)
+    if Ts != Tuple && Ts <: Tuple && Ts isa DataType
+        Expr(:tuple, [:(Array{$et,$ndim}(undef, sz)) for et in Ts.parameters]...)
+    else
+        Expr(:tuple, [:(Array{Any,$ndim}(undef, sz)) for _ in 1:ndim]...)
+    end
+end
+
+@inline function _infer_record_type(f, xs)
+    @static if isdefined(Core, :Compiler)
+        T = Core.Compiler.return_type(f, _teltypes(Core.Typeof(xs)))
+        Base.promote_typejoin_union(T)
+    else
+        Any
+    end
+end
+
+struct GetUnzipVectors{F} end
+@inline function GetUnzipVectors{F}(@specialize(f::F), @specialize(xs::Tuple)) where {F}
+    length(xs) === 0 && error("get_unzip_vectors() expects more than one array input.")
+    t = _infer_record_type(f, xs)
+    return _create_vecs_from_record_type(t, size(xs[1]))
+end
+
+@noinline function _check_size(As)
+    length(As) > 1 || error("function f should accept more than one argument.")
+    out_size = size(As[1])
+    all(isequal(out_size), size.(As)) || throw(DimensionMismatch("size must match"))
+    return Base.require_one_based_indexing(As...)
+end
+
+struct _Kernel!{M,N} end
+
+@inline @generated function _Kernel!{M,N}(out, As, f) where {M,N}
+    rhs_As = [:(As[$i]) for i in 1:M]
+    lhs_As = [Symbol(:As, i) for i in 1:M]
+    rhs_out = [:(out[$i]) for i in 1:N]
+    lhs_out = [Symbol(:out, i) for i in 1:N]
+    lhs_tmp = [Symbol(:tmp, i) for i in 1:N]
+    code = Expr(:block)
+
+    for i in 1:M
+        push!(code.args, Expr(:(=), lhs_As[i], rhs_As[i]))
+    end
+    for i in 1:N
+        push!(code.args, Expr(:(=), lhs_out[i], rhs_out[i]))
+    end
+    loopblock = Expr(:block)
+
+    push!(
+        loopblock.args,
+        Expr(
+            :(=), Expr(:tuple, lhs_tmp...), Expr(:call, :f, [:($v[I]) for v in lhs_As]...)
+        ),
+    )
+
+    for i in 1:N
+        push!(loopblock.args, Expr(:(=), :($(lhs_out[i])[I]), lhs_tmp[i]))
+    end
+
+    push!(
+        code.args,
+        quote
+            @inbounds @simd for I in eachindex(As[1])
+                $loopblock
+            end
+        end,
+    )
+    return code
+end
+
+"""
+    broadcast_unzip(f, As...)
+
+For function `f` that outputs a `Tuple`, this function works similar to `map(f, As...)` but
+outputs a `Tuple` of arrays instead of an array of `Tuple`s.
+
+# Examples
+
+```julia
+julia> using UnzipLoops
+
+julia> f(x, y) = x + y, x - y
+f (generic function with 1 method)
+
+julia> X, Y = [1, 2, 3, 4], [4, 3, 2, 1]
+([1, 2, 3, 4], [4, 3, 2, 1])
+
+julia> map(f, X, Y) # array of tuple
+4-element Vector{Tuple{Int64, Int64}}:
+ (5, -3)
+ (5, -1)
+ (5, 1)
+ (5, 3)
+
+julia> broadcast_unzip(f, X, Y) # tuple of array
+([5, 5, 5, 5], [-3, -1, 1, 3])
+```
+"""
+function broadcast_unzip(f::F, As...) where {F}
+    _check_size(As)
+    out = GetUnzipVectors{F}(f, As)
+    M, N = length(As), length(out)
+    _Kernel!{M,N}(out, As, f)
+    return out
+end
+
+end # module UnzipLoops
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 0000000..b6ff979
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,44 @@
+using UnzipLoops
+using Test
+using OffsetArrays
+
+@testset "UnzipLoops" begin
+    X, Y = rand(1:5, 1024), rand(1:5, 1024)
+    f(x, y) = x + y, x - y
+    function g(X, Y)
+        @assert size(X) == size(Y)
+        Base.require_one_based_indexing(X, Y)
+
+        T = promote_type(eltype(X), eltype(Y))
+        N = ndims(X)
+        Z1 = Array{T,N}(undef, size(X))
+        Z2 = Array{T,N}(undef, size(X))
+        @inbounds @simd for i in eachindex(X)
+            v = f(X[i], Y[i])
+            Z1[i] = v[1]
+            Z2[i] = v[2]
+        end
+        return Z1, Z2
+    end
+    Z1, Z2 = g(X, Y)
+    Z1′, Z2′ = broadcast_unzip(f, X, Y)
+    @test Z1 == Z1′
+    @test Z2 == Z2′
+
+    X, Y, Z = rand(1:5, 1024), rand(1:5, 1024), rand(1:5, 1024)
+    f(x, y, z) = x ^ y ^ z, x / y / z, x * y * z, x / (y*z)
+    out = broadcast_unzip(f, X, Y, Z)
+    out_aos = map(f, X, Y, Z)
+    @test out[1] == getindex.(out_aos, 1)
+    @test out[2] == getindex.(out_aos, 2)
+    @test out[3] == getindex.(out_aos, 3)
+    @test out[4] == getindex.(out_aos, 4)
+
+    @test_throws DimensionMismatch broadcast_unzip(+, [1, 2, 3], [4 5 6])
+
+    # It's not supported yet, but we'd like to support generic array in the future
+    x = OffsetArray([1, 2, 3], -1)
+    msg = "offset arrays are not supported but got an array with index other than 1"
+    @test_throws ArgumentError(msg) broadcast_unzip(+, x, x)
+    @test_broken broadcast_unzip(+, x, x)
+end