Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Features indexpool #1

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2d228f9
Atualizando gitignore
Conradox Jun 19, 2020
68d1500
Adicionando dependencias
Conradox Jun 19, 2020
40e5102
Adicionando primeiro prototipo
Conradox Jun 19, 2020
c8de192
atualizando gitignore
filipebraida Jun 19, 2020
fdf4119
removendo codigo antigo
filipebraida Jun 19, 2020
04d2076
adicionando primeiro iniciativa de buffer
filipebraida Jun 19, 2020
2bb7e4a
primeiro prototipo de buffer e stream com um indice
filipebraida Jun 19, 2020
159568a
adicionado uns testes de limite
filipebraida Jun 19, 2020
82f9a77
adicioando teste para buffer
filipebraida Jun 19, 2020
5976579
refatorando
filipebraida Jun 19, 2020
72f6d3a
reorganizando
filipebraida Jun 19, 2020
4219809
adicionado o datasets no modeulo
filipebraida Jun 19, 2020
6f21bda
removendo arquivo desnecessario
filipebraida Jun 19, 2020
365b024
corrigindo bug
filipebraida Jun 19, 2020
08ca699
refactoring do metodo next
filipebraida Jun 19, 2020
5dbfaa0
renomeando a variavel
filipebraida Jun 19, 2020
a10c674
Criando sistema de download para os datastreams
Conradox Jun 22, 2020
e42c451
Adicionando throw para o getindex do stream
Conradox Jun 22, 2020
aef07a2
mudança nas variaveis
Conradox Jun 30, 2020
7d398c0
Adicionando index para as instâncias
Conradox Jun 30, 2020
920796d
Adicionando acesso através de dois index
Conradox Jul 1, 2020
552f3ce
Revert "Adicionando throw para o getindex do stream"
Conradox Jun 22, 2020
6f51557
Unindo sistema de download com indexing
Conradox Jul 1, 2020
f71abda
refatorando o codigo
filipebraida Jul 2, 2020
915c1aa
reorganizando para os novos nomes pool e stream
filipebraida Jul 2, 2020
014bfc1
reorganizando o codigo
filipebraida Jul 2, 2020
a5f7bdd
Adicionando indexação a partir de um índice ao pool
Conradox Jul 3, 2020
303fe3e
Adicionando indexação a partir de dois índices ao pool
Conradox Jul 3, 2020
d92a3cb
Modificação para que os datasets sejam retornados como streams
Conradox Jul 3, 2020
5bcaadc
Adicionando indexação a partir de 3 índices
Conradox Jul 7, 2020
b613837
Finalização da indexação
Conradox Jul 10, 2020
b70c3a9
Desenho de um protótipo
filipebraida Jul 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
/dev/
/docs/build/
/docs/site/
datasets/
11 changes: 10 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,17 @@ uuid = "ebb30991-6a3b-4324-962c-6bc29053301c"
authors = ["Pedro Conrado"]
version = "0.1.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CSVFiles = "5d742f6a-9f54-50ce-8119-2520741973ca"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
julia = "0.1"
julia = "1.0"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
7 changes: 6 additions & 1 deletion src/EasyStream.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Top-level package module: wires together the source/stream/pool layers and
# the bundled downloadable datasets.
module EasyStream
using DataFrames

# NOTE(review): this line looks like a leftover from the package template —
# confirm whether it should be removed.
greet() = print("Hello World!")
include("source.jl")    # Source: batched cursor over a Tables.jl table
include("stream.jl")    # Stream: fans Source batches out to subscriber tables
include("pool.jl")      # Pool: simple data container
include("datasets.jl")  # DatasetsStreams submodule (downloadable demo data)

# Bring the datasets submodule's exports (Dataset1CDT, DatasetUG_2C_5D) into scope.
using .DatasetsStreams
end # module
47 changes: 47 additions & 0 deletions src/datasets.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
module DatasetsStreams
using CSV, EasyStream

export Dataset1CDT, DatasetUG_2C_5D

# Root of the remote dataset repository.
# NOTE: "sinthetic" is the actual (misspelled) directory name in the upstream
# repo — do not "correct" it or downloads will 404.
const baseurl = "https://raw.githubusercontent.com/Conradox/datastreams/master/sinthetic"

# Local cache directory for downloaded datasets (git-ignored, see .gitignore).
const defdir = joinpath(dirname(@__FILE__), "..", "datasets")

"""
    fetchdata(dir, name)

Download the synthetic dataset file `name` into `dir/synthetic/` and return
the local path. Downloads straight to the target path instead of moving a
temp file, so it also works when the temp dir is on another filesystem.
"""
function fetchdata(dir, name)
    target = joinpath(dir, "synthetic", name)
    mkpath(dirname(target))
    download("$baseurl/$name", target)
    return target
end

# Kept with their original names/signatures; they now honor their `dir`
# argument instead of silently ignoring it (all call sites pass `defdir`,
# so behavior at existing call sites is unchanged).
get1cdtdata(dir) = fetchdata(dir, "1CDT.csv")
getug2c5ddata(dir) = fetchdata(dir, "UG_2C_5D.csv")

"""
    Dataset1CDT(initial_size::Int, flux_size::Int)

Return the 1CDT synthetic dataset wrapped in an `EasyStream.MemoryStream`,
downloading the CSV on first use.

NOTE(review): `EasyStream.MemoryStream` is not defined in the files included
by EasyStream.jl as shown — confirm it exists or update to the current
stream type.
"""
function Dataset1CDT(initial_size::Int, flux_size::Int)::EasyStream.MemoryStream
    filename = joinpath(defdir, "synthetic", "1CDT.csv")

    isfile(filename) || get1cdtdata(defdir)

    data = CSV.read(filename; header = false)

    return EasyStream.MemoryStream(data, initial_size, flux_size)
end

# Default: 150-row initial window, then one row per step.
Dataset1CDT() = Dataset1CDT(150, 1)

"""
    DatasetUG_2C_5D(initial_size::Int, flux_size::Int)

Return the UG_2C_5D synthetic dataset wrapped in an `EasyStream.MemoryStream`,
downloading the CSV on first use.
"""
function DatasetUG_2C_5D(initial_size::Int, flux_size::Int)::EasyStream.MemoryStream
    filename = joinpath(defdir, "synthetic", "UG_2C_5D.csv")

    isfile(filename) || getug2c5ddata(defdir)

    data = CSV.read(filename; header = false)

    return EasyStream.MemoryStream(data, initial_size, flux_size)
end

# Default: 150-row initial window, then one row per step.
DatasetUG_2C_5D() = DatasetUG_2C_5D(150, 1)
end
3 changes: 3 additions & 0 deletions src/pool.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
    Pool{T}

Mutable container over a vector of pooled data. Parametrized on the element
type so the `data` field is concretely typed — the original bare
`data::Vector` field is abstract, which prevents specialization on access.
Construction is backward compatible: `Pool(v)` infers `T` from `v`.
"""
mutable struct Pool{T}
    data::Vector{T}
end
127 changes: 127 additions & 0 deletions src/pool_bk.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Buffered, filterable view over a `Stream`: keeps every chunk received so
# far plus a per-row visibility mask consumed by the `getindex` methods below.
mutable struct Pool{T <: Stream, N}
    stream::T                      # underlying stream new chunks are pulled from
    data::Vector{N}                # chunks received so far (N is the table type, e.g. DataFrame)
    mapping::Vector{Vector{Bool}}  # mapping[i][j]: is row j of chunk i visible?
    size::Int64                    # total rows across all chunks (visible or not)
    N                              # the table type itself; used to build empty results via pool.N()
end

# Build a Pool by pulling the first chunk from `stream`; every row of that
# chunk starts out visible (mask all-true).
function Pool(stream::Stream)
    first_chunk = next!(stream)
    nrows = size(first_chunk, 1)

    chunks = DataFrame[first_chunk]
    masks = Vector{Bool}[ones(Bool, nrows)]

    return Pool(stream, chunks, masks, nrows, DataFrame)
end

# Pull the next chunk from the pool's stream, record it with an all-true
# visibility mask, and return the raw chunk.
function next!(pool::Pool)
    chunk = next!(pool.stream)
    nrows = size(chunk, 1)

    push!(pool.mapping, ones(Bool, nrows))
    push!(pool.data, chunk)
    pool.size += nrows

    return chunk
end


##Utils
# Count the rows currently visible through the mapping masks.
# (Name kept as-is — `useble_length` [sic] is what the indexing methods call.)
function useble_length(pool)
    total = 0
    for (chunk, mask) in zip(pool.data, pool.mapping)
        total += count(mask[1:size(chunk, 1)])
    end
    return total
end

## Indexing - single index: address the `instance`-th *visible* row, counting
## across chunks in order and skipping rows the mask hides.
function Base.getindex(pool::Pool, instance::Int)
    seen = 0
    for (chunk, mask) in zip(pool.data, pool.mapping)
        for row in 1:size(chunk, 1)
            mask[row] || continue
            seen += 1
            # Return the matching row; falls through (returning nothing)
            # when `instance` exceeds the number of visible rows.
            seen == instance && return chunk[row, :]
        end
    end
end

# Collect every visible row into a fresh table of the pool's table type.
function Base.getindex(pool::Pool, c::Colon)
    out = pool.N()
    total = useble_length(pool)
    for idx in 1:total
        push!(out, pool[idx])
    end
    return out
end

# Collect the requested range of visible rows into a fresh table.
function Base.getindex(pool::Pool, range::UnitRange{Int64})
    out = pool.N()
    foreach(idx -> push!(out, pool[idx]), range)
    return out
end

## Indexing - two indexes (instance, feature): every method delegates to the
## single-index row lookup above, then slices the requested feature columns
## from the result.

# One visible row, one feature.
function Base.getindex(pool::Pool, instance::Int, feature::Int)
    return pool[instance][feature]
end

# One visible row, all features.
function Base.getindex(pool::Pool, instance::Int, c::Colon)
    return pool[instance]
end

# One visible row, a range of features.
function Base.getindex(pool::Pool, instance::Int, range::UnitRange{Int64})
    return pool[instance][range]
end

# All visible rows, one feature.
function Base.getindex(pool::Pool, c::Colon, feature::Int)
    return pool[:][:, feature]
end

# All visible rows, a range of features.
function Base.getindex(pool::Pool, c::Colon, range::UnitRange{Int64})
    return pool[:][:, range]
end

# All visible rows, all features.
function Base.getindex(pool::Pool, c1::Colon, c2::Colon)
    return pool[:]
end

# A range of visible rows, one feature.
function Base.getindex(pool::Pool, range::UnitRange{Int64}, feature::Int)
    return pool[range][:, feature]
end

# A range of visible rows, all features.
function Base.getindex(pool::Pool, range::UnitRange{Int64}, c::Colon)
    return pool[range]
end

# A range of visible rows, a range of features.
function Base.getindex(pool::Pool, range::UnitRange{Int64}, range2::UnitRange{Int64})
    return pool[range][:, range2]
end

##Indexing - Using three indexes to move in data through the instances, features, samples

# All visible rows of the `sample`-th chunk, all features.
# (The original kept an unused `count` accumulator; removed.)
function Base.getindex(pool::Pool, instance::Colon, feature::Colon, sample::Int)
    out = pool.N()
    for j in 1:size(pool.data[sample], 1)
        if pool.mapping[sample][j]
            push!(out, pool.data[sample][j, :])
        end
    end
    return out
end

# All visible rows of a range of chunks, all features.
# Bug fix: the original iterated an undefined variable `range` and compared
# `count == instance` where `instance` is a `Colon` — both runtime errors.
# It now mirrors the single-sample method above over each chunk in `sample`.
function Base.getindex(pool::Pool, instance::Colon, feature::Colon, sample::UnitRange{Int64})
    out = pool.N()
    for i in sample
        for j in 1:size(pool.data[i], 1)
            if pool.mapping[i][j]
                push!(out, pool.data[i][j, :])
            end
        end
    end
    return out
end

# All three axes unrestricted: identical to the full-pool view.
Base.getindex(pool::Pool, instance::Colon, feature::Colon, sample::Colon) = pool[:]

# Generic fallback: slice the requested samples first, then index rows and
# features on the resulting table.
# NOTE(review): if `sample` is a type with no specific 3-arg method above
# (e.g. Vector{Int}), `pool[:, :, sample]` dispatches back here and recurses
# forever — confirm the intended set of index types.
Base.getindex(pool::Pool, instance, feature, sample) = pool[:, :, sample][instance, feature]
52 changes: 52 additions & 0 deletions src/source.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using Tables

# Minimal interface for anything that can hand out successive data batches.
abstract type AbstractSource end

"""
    Source{T} <: AbstractSource

Cursor over a table, delivering it in batches via `next`.

Parametrized on the table type so the `table` field is concretely typed
(the original untyped field was `Any`, defeating specialization). The 4-arg
default constructor is unchanged: `Source(table, position, initial_size, batch)`.

# Fields
- `table`: the wrapped table (expected Tables.jl-compatible; see constructor).
- `position`: index of the last row already delivered (0 = nothing yet).
- `initial_size`: number of rows in the first batch.
- `batch`: number of rows per subsequent batch.
"""
mutable struct Source{T} <: AbstractSource
    table::T
    position::Int
    initial_size::Int
    batch::Int
end

"""
    Source(table, initial_size::Int, batch::Int)

Validating constructor: require `table` to be a Tables.jl table, clamp
`initial_size` to the number of rows, and start the cursor at position 0.

# Throws
- `ArgumentError` when `table` is not a Tables.jl table. (The original used
  `@error`, which only logs and then constructed a broken `Source`.)
"""
function Source(table, initial_size::Int, batch::Int)
    Tables.istable(table) || throw(ArgumentError("não é um tipo table"))

    # NOTE(review): generic Tables.jl tables need not support `size`; this
    # works for DataFrame/Matrix-like tables — confirm the intended inputs.
    if initial_size > size(table, 1)
        initial_size = size(table, 1)
        @warn "initial size é maior que o arquivo e será definido para o tamanho do arquivo"
    end

    # Degenerate configurations are allowed but flagged.
    if initial_size == 0
        @warn "initial size é zero"
    end

    if batch == 0
        @warn "flux size é zero"
    end

    return Source(table, 0, initial_size, batch)
end

"""
    next(source::Source)

Return the next batch of rows from `source.table`, or `nothing` once the
table is exhausted. The first call returns the `initial_size` head; each
later call returns up to `batch` rows.
"""
function next(source::Source)
    nrows = size(source.table, 1)

    # First call: deliver the initial window.
    if source.position < source.initial_size
        source.position = source.initial_size
        return source.table[1:source.initial_size, :]
    end

    # Exhausted: signal end-of-stream.
    if source.position >= nrows
        return nothing
    end

    # Subsequent calls: one batch, clamped so the final partial batch cannot
    # index past the end of the table (the original could build an
    # out-of-range index and error on the last batch). The original also
    # repeated the initial-size branch here unreachably; removed.
    stop = min(source.position + source.batch, nrows)
    index = (source.position + 1):stop
    source.position = stop

    return source.table[index, :]
end
24 changes: 24 additions & 0 deletions src/stream.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# A Stream fans batches pulled from `source` out to any number of subscriber
# tables registered via `publish`.
struct Stream
    source::AbstractSource    # where new batches come from
    data_tables::Vector       # subscriber tables; rows are appended by `next`
end

# Convenience constructor: start with no subscribers.
Stream(source::AbstractSource) = Stream(source, Vector{Any}())

"""
    next(stream::Stream; f::Function = copyall)

Pull one batch from `stream.source` and append a per-table selection of its
rows to every subscriber table. `f(nrows, ntables)` must return a Bool
matrix whose column `i` masks the rows delivered to table `i` (default:
every row to every table). Returns `nothing`.
"""
function next(stream::Stream; f::Function = copyall)
    data = next(stream.source)

    # The source signals exhaustion with `nothing`; the original crashed
    # here on `size(nothing)`.
    data === nothing && return nothing

    elements = f(size(data, 1), length(stream.data_tables))

    for i in eachindex(stream.data_tables)
        append!(stream.data_tables[i], data[elements[:, i], :])
    end
    return nothing
end

# Default selection mask: every row goes to every subscriber table.
copyall(qnt_elements, qnt_tables) = fill(true, qnt_elements, qnt_tables)

# Register one or more subscriber tables on the stream; `next` will append
# selected rows to each of them from then on.
function publish(stream::Stream, data_tables...)
    append!(stream.data_tables, data_tables)
    return nothing
end
Loading