Logit transforms (#4)

* Add logitsample * Add logit transforms * tweaks * mutable types * Add tests; Add Temperature, Top_pk, Top_p, Top_k; Add deprecated functions for compat * Remove x86 and 1.9 testing * Update README * Add test for show method
MurrellGroup · Nov 28, 2024 · 00a13bf · 00a13bf
1 parent d8a397c
commit 00a13bf
Show file tree

Hide file tree

Showing 11 changed files with 299 additions and 88 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -24,13 +24,11 @@ jobs:
       matrix:
         version:
           - '1.10'
-          - '1.9'
           - 'pre'
         os:
           - ubuntu-latest
         arch:
           - x64
-          - x86
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@
 *.jl.mem
 /docs/Manifest.toml
 /docs/build/
+Manifest.toml
diff --git a/Project.toml b/Project.toml
@@ -1,17 +1,17 @@
 name = "LogitSamplers"
 uuid = "1b30fcfc-0ee9-4be2-9cfe-b2289b43e041"
 authors = ["murrellb <[email protected]> and contributors"]
-version = "1.0.0-DEV"
+version = "1.1.0-DEV"
 
 [deps]
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
 NNlib = "0.9"
-Random = "1.11.0"
-StatsBase = "0.34"
+Random = "1"
+Statistics = "1"
 julia = "1.9"
 
 [extras]

diff --git a/README.md b/README.md
@@ -4,3 +4,36 @@
 [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://MurrellGroup.github.io/LogitSamplers.jl/dev/)
 [![Build Status](https://github.com/MurrellGroup/LogitSamplers.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/LogitSamplers.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/MurrellGroup/LogitSamplers.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/MurrellGroup/LogitSamplers.jl)
+
+A Julia package for GPU-friendly sampling from logit distributions with various transformation methods commonly used in language models.
+
+## Usage
+
+The package provides a set of logit transforms to modify the distributions in the log domain.
+
+```julia
+using LogitSamplers
+
+# Create a temperature transform
+temperature = Temperature(1.5)
+
+# Create a top-p transform
+top_p = Top_p(0.5)
+
+# Compose a function that first applies temperature, then top-p
+transform = top_p ∘ temperature
+
+# Create a token index sampler function from the transform
+sampler = logitsample ∘ transform
+
+# or equivalently:
+sampler = logits -> logitsample(top_p(temperature(logits)))
+
+logits = randn(100)
+
+# Get token probabilities from the sampler
+probs = softmax(transform(logits))
+
+# Sample a logit index from the sampler
+index = sampler(logits)
+```
diff --git a/src/LogitSamplers.jl b/src/LogitSamplers.jl
@@ -1,9 +1,22 @@
 module LogitSamplers
 
-using NNlib, StatsBase
+using NNlib: softmax
+using Random
+using Statistics: std
 
-include("samplers.jl")
+include("mask.jl")
 
+include("sample.jl")
+export logitsample
+
+include("transforms.jl")
+export LogitTransform
+export Temperature
+export Top_pk, Top_p, Top_k
+export Min_p
+export Top_nσ
+
+include("deprecated.jl")
 export argmax_sampler, top_pk_sampler, min_p_sampler, top_nσ_sampler
 
 end
diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -0,0 +1,11 @@
+@deprecate argmax_sampler(logits; device=identity) Top_k(1)(device(logits))
+@deprecate argmax_sampler(; kwargs...) logits -> argmax_sampler(logits; kwargs...)
+
+@deprecate top_pk_sampler(logits; p = 0.5f0, k = 5, device = identity) Top_pk(p, k)(device(logits))
+@deprecate top_pk_sampler(; kwargs...) logits -> top_pk_sampler(logits; kwargs...)
+
+@deprecate min_p_sampler(logits; pbase = 0.5f0, device = identity) Min_p(pbase)(device(logits))
+@deprecate min_p_sampler(; kwargs...) logits -> min_p_sampler(logits; kwargs...)
+
+@deprecate top_nσ_sampler(logits; temperature = 1.0f0, n = 1.0f0, device = identity) Top_nσ(temperature, n)(device(logits))
+@deprecate top_nσ_sampler(; kwargs...) logits -> top_nσ_sampler(logits; kwargs...)
diff --git a/src/mask.jl b/src/mask.jl
@@ -0,0 +1,8 @@
+apply_mask(x::AbstractVector{T}, mask::AbstractVector{Bool}) where T<:AbstractFloat =
+    T(-Inf) * .!mask + x
+
+function create_mask(x::AbstractVector, indices::AbstractVector{Int})
+    mask = similar(x, Bool) .= false
+    mask[indices] .= true
+    return mask
+end
diff --git a/src/sample.jl b/src/sample.jl
@@ -0,0 +1,15 @@
+"""
+    logitsample([rng], logits, [buffer=similar(logits)]) -> Int
+
+Sample an index from a logit distribution using the Gumbel argmax trick.
+
+Alternatively pass a buffer to avoid allocating a new array when creating
+the random numbers.
+"""
+function logitsample(rng::AbstractRNG, x::AbstractVector{T}, u::AbstractVector{T}=similar(x)) where T<:AbstractFloat
+    length(x) == length(u) || throw(DimensionMismatch("Expected buffer of same length as logits"))
+    rand!(rng, u)
+    argmax(-log.(-log.(u)) + x)
+end
+
+@inline logitsample(args...) = logitsample(Random.default_rng(), args...)
diff --git a/src/samplers.jl b/src/samplers.jl
diff --git a/src/transforms.jl b/src/transforms.jl
@@ -0,0 +1,74 @@
+abstract type LogitTransform <: Function end
+
+Base.show(io::IO, ::MIME"text/plain", t::LogitTransform) = show(io, t)
+
+
+"""
+    Temperature(T)
+
+A logit transform that scales (divides) the logits by a temperature parameter.
+"""
+mutable struct Temperature{T<:Real} <: LogitTransform
+    T::T
+end
+
+(t::Temperature)(logits::AbstractVector{T}) where T = logits / T(t.T)
+
+
+"""
+    Top_pk(p, k)
+
+A logit transform that masks logits to only include tokens in the top `k` or the top `p` cumulative probability.
+"""
+mutable struct Top_pk{P<:Real,K<:Union{Integer,Nothing}} <: LogitTransform
+    p::P
+    k::K
+end
+
+function (t::Top_pk)(logits::AbstractVector{T}) where T<:AbstractFloat
+    0 < t.p <= 1 || throw(DomainError(t.p, "p must be in the interval (0, 1]"))
+    probs = softmax(logits)
+    sorted_probs = sort(probs, rev=true)
+    cutoff_p = maximum(sorted_probs[cumsum(sorted_probs) .>= t.p]; init=zero(T))
+    cutoff_k = t.k isa Integer ? maximum(sorted_probs[t.k:t.k]) : zero(T)
+    return apply_mask(logits, probs .>= max(cutoff_p, cutoff_k))
+end
+
+Top_p(p) = Top_pk(p, nothing)
+Top_k(k) = Top_pk(1, k)
+
+
+"""
+    Min_p(pbase)
+
+A logit transform that samples from the most probable tokens using the min-p strategy.
+
+See: https://arxiv.org/pdf/2407.01082
+"""
+mutable struct Min_p{T<:Real} <: LogitTransform
+    pbase::T
+end
+
+function (t::Min_p)(logits::AbstractVector)
+    p = softmax(logits)
+    return apply_mask(logits, p .>= t.pbase * maximum(p))
+end
+
+
+"""
+    Top_nσ(n)
+
+A logit transform that samples within `n` standard deviations of the maximum logit.
+
+Top-nσ is temperature-invariant, i.e. the candidate set does not change with temperature.
+
+See: https://arxiv.org/pdf/2411.07641
+"""
+mutable struct Top_nσ{T<:Real} <: LogitTransform
+    n::T
+end
+
+function (t::Top_nσ)(logits::AbstractVector)
+    M, σ = maximum(logits), std(logits)
+    return apply_mask(logits, logits .>= M - t.n * σ)
+end