Skip to content

Commit

Permalink
reorganize repo to more standard format (#5)
Browse files Browse the repository at this point in the history
* reorganize repo

* fix 32-bit tests

* omit CR from CRLF at EOL
  • Loading branch information
stevengj authored Mar 25, 2024
1 parent a19c430 commit 9b4b385
Show file tree
Hide file tree
Showing 12 changed files with 207 additions and 180 deletions.
74 changes: 74 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
name: CI
# Run on master, tags, or any pull request
on:
schedule:
- cron: '0 2 * * *' # Daily at 2 AM UTC (8 PM CST)
push:
branches: [master]
tags: ["*"]
pull_request:
concurrency:
# Skip intermediate builds: always.
# Cancel intermediate builds: only if it is a pull request build.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
version:
- "1.6" # Earliest supported release
- "1" # Latest release
os:
- ubuntu-latest
- macOS-latest
- windows-latest
arch:
- x64
- x86
exclude:
# Test 32-bit only on Linux
- os: macOS-latest
arch: x86
- os: windows-latest
arch: x86
include:
# Add specific version used to run the reference tests.
# Must be kept in sync with version check in `test/runtests.jl`,
# and with the branch protection rules on the repository which
# require this specific job to pass on all PRs
# (see Settings > Branches > Branch protection rules).
- os: ubuntu-latest
version: 1.10.0
arch: x64
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v4
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-${{ matrix.arch }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-${{ matrix.arch }}-test-${{ env.cache-name }}-
${{ runner.os }}-${{ matrix.arch }}-test-
${{ runner.os }}-${{ matrix.arch }}-
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@latest
- run: |
git config --global user.name Tester
git config --global user.email [email protected]
- uses: julia-actions/julia-runtest@latest
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v4
with:
files: lcov.info
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false
18 changes: 18 additions & 0 deletions .github/workflows/CompatHelper.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: CompatHelper
on:
schedule:
- cron: '0 0 * * *' # Every day at midnight
workflow_dispatch:
jobs:
CompatHelper:
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
run: julia -e 'using CompatHelper; CompatHelper.main()'
15 changes: 15 additions & 0 deletions .github/workflows/TagBot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: TagBot
on:
issue_comment:
types:
- created
workflow_dispatch:
jobs:
TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.DOCUMENTER_KEY }}
65 changes: 0 additions & 65 deletions Manifest.toml

This file was deleted.

11 changes: 10 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,14 @@ authors = ["codegodz <[email protected]>"]
version = "0.1.0"

[deps]
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"

[compat]
julia = "1.6"
StringViews = "1.3"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
22 changes: 10 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ To install use:

### Features
Currently we only have some basic features like reading a line and splitting it.
For examples on how to generate test data and run the codes below see [`src/test.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/src/test.jl)
For examples of how to generate test data and run the code below, see [`test/runtests.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/test/runtests.jl)

#### 1. eachlineV
**`eachlineV(file_path::String; buffer_size::Int64=10_000)`**
**`eachlineV(file_path::String; buffer_size::Int=10_000)`**


This function can be used just like the base[ `eachline` ](https://docs.julialang.org/en/v1/base/io-network/#Base.eachline " `eachline` ") in Julia. The argument `buffer_size` determines the size of the underlaying UInt8 vector. The `buffer_size` should be bigger than the longest line in a file. If this is uknown just use a big number like 1M. This function will throw a warning if no new line is found when the eof is not reached yet - giving a clue to increase the `buffer_size`.
This function can be used just like the base [`eachline`](https://docs.julialang.org/en/v1/base/io-network/#Base.eachline "`eachline`") in Julia. The argument `buffer_size` determines the size of the underlying `UInt8` vector. The `buffer_size` should be bigger than the longest line in the file. If this is unknown, just use a big number like 1M. This function will emit a warning if no newline is found before the end of the file is reached, which is a hint to increase the `buffer_size`.

**Example**

Expand All @@ -48,16 +48,16 @@ Similar to the base [`split`](https://docs.julialang.org/en/v1/base/strings/#Bas

**Example**

For example to check how often we see the string "TARGET" at column 3 in a given file
For example, to check how often we see the string "TARGET" in the first column of a given file:
```Julia

c = 0
for line in eachlineV("../data/test.txt")
data = splitV(line, '\t')
data = splitV(line, '\t')
if data[1] == "TARGET"
c +=1
c +=1
end
end
end
println(c)
```

Expand All @@ -82,13 +82,13 @@ c = 0
for line in eachlineV("../data/numbs.txt")
for item in splitV(line, '\t')
c += parseV(UInt32, item)
end
end
end
println(c)
```

### Benchmark
We added a simple benchmark in [`src/test.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/src/test.jl), for my computer with:
We added a simple benchmark in [`test/runtests.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/test/runtests.jl), for my computer with:
- `gen_string_data(10_000)`
- `gen_numb_data(10_000)`
- and a buffer_size of `10_000`
Expand All @@ -113,7 +113,5 @@ so the best is just to try some buffer sizes and see where it works optimally

To make this a bit more visual, we compared the base reader to the view reader.
On the:
- **x-axis** is the nubmer of lines in a file and
- **x-axis** is the number of lines in a file and
- **y-axis** the time in seconds to iterate over them

![BenchmarkImage](https://www.linkpicture.com/q/reader_benchmark.png)
60 changes: 28 additions & 32 deletions src/FileReader.jl
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@

using StringViews
using StaticArrays

###########################################################################
# Code to read from a file
# Code to read from a file
###########################################################################

struct BufferedReader{IOT <: IO}
io::IOT
buffer::Int64
tot_alloc::Int64
buffer::Int
tot_alloc::Int
arr::Vector{UInt8}
end

# Function to flip elements in an array to a specified offset(buffer size here)
function flip!(arr::Vector{UInt8}, buffer::Int64)
function flip!(arr::Vector{UInt8}, buffer::Int)
@inbounds @simd for i in 1:buffer
arr[i] = arr[i+buffer]
end
Expand All @@ -24,50 +23,50 @@ function read_next_chunk!(reader::BufferedReader)
# Move last read chunk to front of the array
# (except in first iter)
flip!(reader.arr, reader.buffer)

# Store new chunk in second part of the array
bytes_read::Int = readbytes!(reader.io, view(reader.arr, reader.buffer+1:reader.tot_alloc), reader.buffer)
bytes_read::Int = readbytes!(reader.io, view(reader.arr, reader.buffer+1:reader.tot_alloc), reader.buffer)

# If we read less than the buffer size we have to reset the array
# values after "bytes_read" as this is old data (previous read)
if bytes_read < reader.buffer
@inbounds for i in reader.buffer+bytes_read+1:reader.tot_alloc
reader.arr[i] = 0x00
end
end
end
end
end

function find_newline(reader::BufferedReader, state::Int64)
function find_newline(reader::BufferedReader, state::Int)
cur_stop = copy(state) + 1
@inbounds for i in (state + 1):reader.tot_alloc
if reader.arr[i] == 0x0a
return cur_stop:i-1, i
end
end

@inbounds for i in (state + 1):reader.tot_alloc
if reader.arr[i] == 0x0a
return cur_stop:(i > 1 && reader.arr[i-1] == 0x0d ? i-2 : i-1), i
end
end

return 0:0, cur_stop
end

function eachlineV(io::IO; buffer_size::Int64=10_000)
function eachlineV(io::IO; buffer_size::Int=10_000)
# Allocate buffer array
tot_alloc = buffer_size * 2
buffer_arr = zeros(UInt8, tot_alloc)
# We will set up a buffered reader through which we
buffer_arr = zeros(UInt8, tot_alloc)

# We will set up a buffered reader through which we
# stream the file bytes, >4x as fast as a regular reader
reader = BufferedReader(io, buffer_size, buffer_size*2, buffer_arr)

# Also populate the reader with the first chunk already
# Also populate the reader with the first chunk already
read_next_chunk!(reader)
return reader
end

function eachlineV(file_path::String; buffer_size::Int64=10_000)
function eachlineV(file_path::String; buffer_size::Int=10_000)
io = open(file_path, "r")
return eachlineV(io, buffer_size=buffer_size)
end


# Override in case we want to reuse buffers and handles
function eachlineV(io::IO, buffer_arr::Vector{UInt8})
Expand All @@ -79,28 +78,25 @@ function eachlineV(io::IO, buffer_arr::Vector{UInt8})
end

@inline function Base.iterate(reader::BufferedReader)
# This is the first iter so only the last half of the array is filled now,
# hence the scan starts from buffer + 1 (i.e. with state == reader.buffer).
#
# Delegate to the stateful method instead of building the view directly:
# when the first chunk holds no newline, find_newline returns the range 0:0,
# and view(reader.arr, 0:0) would throw a BoundsError. The stateful method
# already handles that case by ending the iteration at EOF (e.g. an empty
# file) or by reading the next chunk and searching again.
return iterate(reader, reader.buffer)
end

@inline function Base.iterate(reader::BufferedReader, state::Int64)
@inline function Base.iterate(reader::BufferedReader, state::Int)
r, state = find_newline(reader, state)
if r.start == 0
if !eof(reader.io)
read_next_chunk!(reader)
r, state = find_newline(reader, state - reader.buffer - 1)
else
close(reader.io)
return nothing
end
return nothing
end
end
# I twould be odd to not reach EOF but still not find
# It would be odd to not reach EOF but still not find
# a full line, throw warning
r.stop == 0 && @warn ("Buffer probably too small")
return StringView(view(reader.arr, r)), state
end



Loading

0 comments on commit 9b4b385

Please sign in to comment.