Skip to content

Commit

Permalink
Check for duplicate characters using a bitmask with multiple uint32s (#5
Browse files Browse the repository at this point in the history
)

* Check for duplicate characters using a bitmask with multiple uint32s
  • Loading branch information
mprimeaux authored Oct 29, 2024
1 parent 27d39cd commit 0896971
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 194 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
- main

jobs:
release:
build:
runs-on: ubuntu-latest
env:
flags: ""
Expand Down
15 changes: 14 additions & 1 deletion CHANGELOG/CHANGELOG-1.x.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
### Changed
- **DEBT:** Check for duplicate characters using a bitmask made of multiple `uint32`s. An array of eight `uint32` values provides `256` bits (`8 × 32 = 256`), one bit per possible byte value, which avoids the 64-bit limit of a single `uint64` bitmask.
### Deprecated
### Removed
### Fixed
### Security

---
## [1.6.0] - 2024-OCT-29

### Added
### Changed
- **DEBT:** Check for duplicate characters using a bitmask made of multiple `uint32`s. An array of eight `uint32` values provides `256` bits (`8 × 32 = 256`), one bit per possible byte value, which avoids the 64-bit limit of a single `uint64` bitmask.
### Deprecated
### Removed
### Fixed
Expand Down Expand Up @@ -79,7 +91,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
### Security

[Unreleased]: https://github.com/scriptures-social/platform/compare/v1.5.0...HEAD
[Unreleased]: https://github.com/sixafter/nanoid/compare/v1.6.0...HEAD
[1.6.0]: https://github.com/sixafter/nanoid/compare/v1.5.0...v1.6.0
[1.5.0]: https://github.com/sixafter/nanoid/compare/v1.4.0...v1.5.0
[1.4.0]: https://github.com/sixafter/nanoid/compare/v1.3.0...v1.4.0
[1.3.0]: https://github.com/sixafter/nanoid/compare/v1.2.0...v1.3.0
Expand Down
71 changes: 48 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ A simple, fast, and efficient Go implementation of [NanoID](https://github.com/a

- **Short & Unique IDs**: Generates compact and collision-resistant identifiers.
- **Cryptographically Secure**: Utilizes Go's crypto/rand package for generating cryptographically secure random numbers. This guarantees that the generated IDs are both unpredictable and suitable for security-sensitive applications.
- **Customizable Alphabet**: Define your own set of characters for ID generation.
- **Customizable Alphabet**: Define your own set of characters for ID generation with a minimum length of 2 characters and a maximum length of 256 characters.
- **Concurrency Safe**: Designed to be safe for use in concurrent environments.
- **High Performance**: Optimized with buffer pooling to minimize allocations and enhance speed.
- **Zero Dependencies**: Lightweight implementation with no external dependencies beyond the standard library.
Expand Down Expand Up @@ -222,29 +222,29 @@ goos: darwin
goarch: arm64
pkg: github.com/sixafter/nanoid
cpu: Apple M2 Ultra
BenchmarkGenerateDefault-24 3985082 300.7 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateCustomAlphabet-24 3429874 346.0 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateShortID-24 3646383 327.2 ns/op 10 B/op 2 allocs/op
BenchmarkGenerateLongID-24 2557196 468.1 ns/op 128 B/op 2 allocs/op
BenchmarkGenerateMaxAlphabet-24 4532246 263.8 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateMinAlphabet-24 2507995 479.8 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateWithBufferPool-24 3468786 343.9 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDefaultParallel-24 1530394 790.9 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateCustomAlphabetParallel-24 1386268 861.6 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateShortIDParallel-24 1421832 842.7 ns/op 10 B/op 2 allocs/op
BenchmarkGenerateLongIDParallel-24 1000000 1050 ns/op 128 B/op 2 allocs/op
BenchmarkGenerateExtremeConcurrency-24 1530957 785.7 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_5-24 3659472 327.7 ns/op 10 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_10-24 3436932 346.0 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_20-24 3140282 381.1 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_50-24 2580222 470.5 ns/op 128 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_100-24 1936257 617.2 ns/op 224 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_2-24 2510594 479.6 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_6-24 3452442 346.3 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_26-24 3901122 308.0 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_38-24 3562468 336.3 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDefault-24 4318624 277.3 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateCustomAlphabet-24 4156414 288.6 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateShortID-24 4416091 271.3 ns/op 10 B/op 2 allocs/op
BenchmarkGenerateLongID-24 2899802 418.8 ns/op 128 B/op 2 allocs/op
BenchmarkGenerateMaxAlphabet-24 4463840 266.2 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateMinAlphabet-24 4496391 269.1 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateWithBufferPool-24 3953864 288.0 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDefaultParallel-24 1730868 695.0 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateCustomAlphabetParallel-24 1692622 727.6 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateShortIDParallel-24 1856241 674.7 ns/op 10 B/op 2 allocs/op
BenchmarkGenerateLongIDParallel-24 1297450 931.4 ns/op 128 B/op 2 allocs/op
BenchmarkGenerateExtremeConcurrency-24 1715571 685.2 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_5-24 4389192 270.5 ns/op 10 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_10-24 4158094 288.1 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_20-24 3615992 326.2 ns/op 48 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_50-24 2885797 494.5 ns/op 128 B/op 2 allocs/op
BenchmarkGenerateDifferentLengths/Length_100-24 1597360 752.7 ns/op 224 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_2-24 4523349 264.1 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_6-24 4158720 289.2 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_26-24 4235298 284.6 ns/op 32 B/op 2 allocs/op
BenchmarkGenerateDifferentAlphabets/Alphabet_38-24 3745124 327.4 ns/op 32 B/op 2 allocs/op
PASS
ok github.com/sixafter/nanoid 34.903s
ok github.com/sixafter/nanoid 34.773s
```

* `ns/op` (Nanoseconds per Operation):
Expand All @@ -265,6 +265,31 @@ Nano ID generates unique identifiers based on the following:
2. Mapping to Alphabet: Each random byte is mapped to a character in a predefined alphabet to form the final ID.
3. Uniform Distribution: To ensure that each character in the alphabet has an equal probability of being selected, Nano ID employs techniques to avoid bias, especially when the alphabet size isn't a power of two.

### Custom Alphabet Constraints

* **Alphabet Length**: Must be between 2 and 256 unique single-byte characters.
* **Uniqueness**: All characters in the alphabet must be unique.
* **Character Encoding**: Only single-byte characters (byte) are supported.
* **Error Handling**: The generator will return specific errors if the alphabet doesn't meet the constraints.

1. Length Requirements:
* Minimum Length 2 Characters: An alphabet with fewer than two characters cannot produce diverse or secure IDs. At least two unique characters are necessary to generate a variety of IDs.
* Maximum Length 256 Characters: The implementation utilizes a byte-based approach where each character in the alphabet is represented by a single byte (`0-255`). This inherently limits the maximum number of unique characters to 256. Attempting to use an alphabet longer than 256 characters will result in an error.
2. Uniqueness of Characters:
* All Characters Must Be Unique: Duplicate characters in the alphabet can introduce biases in ID generation and compromise the randomness and uniqueness of the IDs. The generator enforces uniqueness by checking for duplicates during initialization. If duplicates are detected, it will return an `ErrDuplicateCharacters` error.
3. Character Encoding:
* Single-Byte Characters Only: The implementation is designed to work with single-byte (`byte`) characters, which correspond to values `0-255`. Using multi-byte characters (such as UTF-8 characters beyond the basic ASCII set) can lead to unexpected behavior and is not supported.
* Recommended Character Sets:
* URL-Friendly Characters: Typically, alphanumeric characters (`A-Z`, `a-z`, `0-9`) along with symbols like `-` and `_` are used to ensure that generated IDs are safe for use in URLs and file systems.
* Custom Sets: You can define your own set of unique single-byte characters based on your application's requirements.
4. Power-of-Two Considerations:
* Mask Calculation: The generator calculates a mask based on the number of bits required to represent the alphabet length minus one.
```go
k := bits.Len(uint(alphabetLen - 1))
mask := byte((1 << k) - 1)
```
* Implications: The alphabet length doesn't need to be a power of two. The mask trims each random byte to the smallest bit width that can index the whole alphabet, and any masked value that is still greater than or equal to the alphabet length is discarded rather than wrapped around. This ensures that each character in the alphabet has an equal probability of being selected.
## Contributing
Contributions are welcome. See [CONTRIBUTING](CONTRIBUTING.md)
Expand Down
109 changes: 62 additions & 47 deletions nanoid.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ var DefaultGenerator Generator

// Generate generates a Nano ID using the default generator and the default size.
func Generate() (string, error) {
return GenerateSize(DefaultSize)
return DefaultGenerator.Generate(DefaultSize)
}

// GenerateSize generates a Nano ID using the default generator.
Expand All @@ -37,31 +37,32 @@ func init() {
}

var (
ErrInvalidLength = errors.New("length must be positive")
ErrExceededMaxAttempts = errors.New("generate method exceeded maximum attempts, possibly due to invalid mask or alphabet")
ErrEmptyAlphabet = errors.New("alphabet must not be empty")
ErrAlphabetTooShort = errors.New("alphabet length must be at least 2")
ErrAlphabetTooLong = errors.New("alphabet length must not exceed 256")
ErrDuplicateCharacters = errors.New("alphabet contains duplicate characters")
ErrInvalidLength = errors.New("invalid length")
ErrInvalidAlphabet = errors.New("invalid alphabet")
ErrDuplicateCharacters = errors.New("duplicate characters in alphabet")
ErrExceededMaxAttempts = errors.New("exceeded maximum attempts")
)

const (
// DefaultAlphabet Default alphabet as per Nano ID specification.
DefaultAlphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
// DefaultAlphabet as per Nano ID specification.
const DefaultAlphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

// DefaultSize Default size of the generated Nano ID: 21.
DefaultSize = 21
)
// DefaultSize is the default size of the generated Nano ID: 21.
const DefaultSize = 21

// maxAttemptsMultiplier defines the multiplier for maximum attempts based on length.
const maxAttemptsMultiplier = 10

// Generator holds the configuration for the Nano ID generator.
// Generator defines the interface for generating Nano IDs.
type Generator interface {
Generate(size int) (string, error)
}

// Configuration defines the interface for retrieving generator configuration.
type Configuration interface {
GetConfig() Config
}

// Config holds the configuration for the Nano ID generator.
type Config struct {
Alphabet []byte
AlphabetLen int
Expand All @@ -76,50 +77,68 @@ type generator struct {
}

// New creates a new Generator with buffer pooling enabled.
// It returns an error if the alphabet is invalid.
func New(alphabet string, randReader io.Reader) (Generator, error) {
return newGenerator(alphabet, randReader)
}

// newGenerator is an internal constructor for generator.
func newGenerator(alphabet string, randReader io.Reader) (Generator, error) {
if len(alphabet) == 0 {
return nil, ErrEmptyAlphabet
return nil, ErrInvalidAlphabet
}

if randReader == nil {
randReader = rand.Reader // Initialize here
randReader = rand.Reader
}

alphabetBytes := []byte(alphabet)
alphabetLen := len(alphabetBytes)

if alphabetLen < 2 {
return nil, ErrAlphabetTooShort
if alphabetLen < 2 || alphabetLen > 256 {
return nil, ErrInvalidAlphabet
}

if alphabetLen > 256 {
return nil, ErrAlphabetTooLong
}

// Check for duplicate characters
seen := make(map[byte]struct{}, alphabetLen)
// NOTE: Alternatively, a []bool slice can track seen characters. While slightly less memory-efficient than
// using a bitmask, it's straightforward and still performant.
//
// Check for duplicate characters using a boolean slice
// seen := make([]bool, 256)
// for _, b := range alphabetBytes {
// if seen[b] {
// return nil, ErrDuplicateCharacters
// }
// seen[b] = true
// }

// Check for duplicate characters using a bitmask with multiple uint32s
// A uint32 array can represent 256 bits (32 bits per uint32 × 8 = 256). This allows us to track each
// possible byte value without the limitations of a single uint64
var seen [8]uint32 // 8 * 32 = 256 bits
for _, b := range alphabetBytes {
if _, exists := seen[b]; exists {
idx := b / 32
bit := b % 32
if (seen[idx] & (1 << bit)) != 0 {
return nil, ErrDuplicateCharacters
}
seen[b] = struct{}{}
seen[idx] |= 1 << bit
}

// Calculate mask using power-of-two approach
k := bits.Len(uint(alphabetLen - 1))
if k == 0 {
return nil, ErrAlphabetTooShort
return nil, ErrInvalidAlphabet
}
mask := byte((1 << k) - 1)

// Calculate step based on mask
step := (8 * 128) / bits.OnesCount8(mask)

// Initialize buffer pool as a pointer
// Initialize buffer pool to store pointers to byte arrays
bufferPool := &sync.Pool{
New: func() interface{} {
b := make([]byte, step)
return &b // Store pointer to slice
var buffer [128]byte // Using a fixed-size array to avoid dynamic allocation
return &buffer
},
}

Expand All @@ -131,48 +150,43 @@ func New(alphabet string, randReader io.Reader) (Generator, error) {
Step: step,
},
randReader: randReader,
bufferPool: bufferPool, // Always assigned
bufferPool: bufferPool,
}, nil
}

// GenerateSize creates a new Nano ID of the specified length.
// It ensures that each character in the ID is selected uniformly from the alphabet.
// Pre-allocated errors are used to minimize memory allocations.
// Generate creates a new Nano ID of the specified length.
// It implements the Generator interface.
func (g *generator) Generate(length int) (string, error) {
if length <= 0 {
return "", ErrInvalidLength
}

id := make([]byte, length)
cursor := 0
maxAttempts := length * 10 // Prevent infinite loops
maxAttempts := length * maxAttemptsMultiplier
attempts := 0

// Retrieve a pointer to the buffer from the pool
bufferPtr := g.bufferPool.Get().(*[]byte)
buffer := *bufferPtr
defer func() {
for i := range buffer {
buffer[i] = 0
}
g.bufferPool.Put(bufferPtr) // Return the pointer to the pool
}()
bufferPtr := g.bufferPool.Get().(*[128]byte)
buffer := bufferPtr[:]
defer g.bufferPool.Put(bufferPtr) // Return the pointer to the pool

for cursor < length {
if attempts >= maxAttempts {
return "", ErrExceededMaxAttempts
}
attempts++

n, err := g.randReader.Read(buffer)
// Read full buffer
_, err := io.ReadFull(g.randReader, buffer)
if err != nil {
return "", err
}
buffer = buffer[:n]

for _, rnd := range buffer {
if int(rnd&g.config.Mask) < g.config.AlphabetLen {
id[cursor] = g.config.Alphabet[rnd&g.config.Mask]
rnd &= g.config.Mask
if int(rnd) < g.config.AlphabetLen {
id[cursor] = g.config.Alphabet[rnd]
cursor++
if cursor == length {
break
Expand All @@ -185,6 +199,7 @@ func (g *generator) Generate(length int) (string, error) {
}

// GetConfig returns the configuration for the generator.
// It implements the Configuration interface.
func (g *generator) GetConfig() Config {
return g.config
}
Loading

0 comments on commit 0896971

Please sign in to comment.