Merge pull request #46 from PharmCat/dev

Raw column
PharmCat · Apr 9, 2024 · 4879913 · 4879913
2 parents f76cd21 + 1ce746b
commit 4879913
Show file tree

Hide file tree

Showing 15 changed files with 164 additions and 39 deletions.
diff --git a/.github/workflows/Tier1.yml b/.github/workflows/Tier1.yml
@@ -30,7 +30,6 @@ jobs:
     strategy:
       matrix:
         version:
-          - '1.6'
           - '1.8'
           - '1'
         os:

diff --git a/Project.toml b/Project.toml
@@ -3,7 +3,7 @@ uuid = "a1dec852-9fe5-11e9-361f-8d9fde67cfa2"
 keywords = ["lenearmodel", "mixedmodel"]
 desc = "Mixed-effects models with flexible covariance structure."
 authors = ["Vladimir Arnautov <[email protected]>"]
-version = "0.15.0"
+version = "0.15.1"
 
 [deps]
 DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -400,6 +400,13 @@ Metida.typeiii
 Metida.MILMM
 ```
 
+### Metida.RawCoding
+
+```@docs
+Metida.RawCoding
+```
+
+
 ## Not API functions
 
 ### Metida.contrast
@@ -443,3 +450,10 @@ Metida.tname
 ```@docs
 Metida.raneflenv  
 ```
+
+### Metida.edistance  
+
+```@docs
+Metida.edistance  
+```
+
diff --git a/docs/src/custom.md b/docs/src/custom.md
@@ -90,6 +90,7 @@ Example:
 ```@example lmmexample
 using Metida, DataFrames, CSV, CategoricalArrays
 
+spatdf       = CSV.File(joinpath(dirname(pathof(Metida)), "..", "test", "csv", "spatialdata.csv"); types = [Int, Int, String, Float64, Float64, Float64, Float64, Float64]) |> DataFrame
 ftdf = CSV.File(joinpath(dirname(pathof(Metida)), "..", "test", "csv",  "1fptime.csv"); types = [String, String, Float64, Float64]) |> DataFrame
 df0 = CSV.File(joinpath(dirname(pathof(Metida)), "..", "test", "csv",  "df0.csv"); types = [String, String, String, String,Float64, Float64, Float64]) |> DataFrame
 
@@ -143,3 +144,30 @@ repeated = Metida.VarEffect(Metida.@covstr(period|subject), CustomCovarianceStru
 )
 Metida.fit!(lmm)
 ```
+
+### Custom distance estimation for spatial structures
+
+If you want to use coordinates or some other structures for distance estimation you can define method [`Metida.edistance`](@ref) to calculate distance:
+
+```@example lmmexample
+function Metida.edistance(mx::AbstractMatrix{<:CartesianIndex}, i::Int, j::Int)
+    return sqrt((mx[i, 1][1] - mx[j, 1][1])^2 + (mx[i, 1][2] - mx[j, 1][2])^2)
+end
+```
+
+For example this method returns distance between two vectors represented as `CartesianIndex`.
+
+Make vector of `CartesianIndex`:
+
+```@example lmmexample
+spatdf.ci = map(x -> CartesianIndex(x[:x], x[:y]), eachrow(spatdf))
+```
+
+Then use new column as "raw" variable with  [`Metida.RawCoding`](@ref) contrast and fit the model:
+
+```@example lmmexample
+lmm = Metida.LMM(@formula(r2 ~ f), spatdf;
+    repeated = Metida.VarEffect(Metida.@covstr(ci|1), Metida.SPEXP; coding = Dict(:ci => Metida.RawCoding())),
+    )
+Metida.fit!(lmm)
+```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -38,6 +38,10 @@ Implemented covariance structures:
 
 Actually Metida can fit datasets with wore than  160k observation and 40k subjects levels on PC with 64 GB RAM. This is not "hard-coded" limitation, but depends on your model and data structure. Fitting of big datasets can take a lot of time. Optimal dataset size is less than 100k observations with maximum length of block less than 400.
 
+!!! note
+
+    Julia v1.8 or higher required.
+
 ## Contents
 
 ```@contents

diff --git a/src/Metida.jl b/src/Metida.jl
@@ -3,7 +3,6 @@
 
 __precompile__()
 module Metida
-
 using ProgressMeter, LinearAlgebra, ForwardDiff, DiffResults, Random, Optim, LineSearches, MetidaBase#, SparseArrays#, Polyester#, LoopVectorization
 import StatsBase, StatsModels, Distributions
 
@@ -32,7 +31,7 @@ TOEPH, HeterogeneousToeplitz,
 TOEPHP, HeterogeneousToeplitzParameterized,
 SPEXP, SpatialExponential,
 SPPOW, SpatialPower,
-SPGAU, SpatialGaussian,
+SPGAU, SpatialGaussian, RawCoding,
 UN, Unstructured,
 CovarianceType,
 fit, fit!, LMM, VarEffect, theta, logreml, m2logreml, thetalength, dof_satter, dof_contain, rankx, caic, lcontrast, typeiii, estimate, contrast,

diff --git a/src/fit.jl b/src/fit.jl
@@ -63,7 +63,7 @@ Fit LMM model.
 * `refitinit` - true/false - if `true` - use last values for initial condition  (`false` by default)
 * `optmethod` - Optimization method. Look at Optim.jl documentation. (Newton by default)
 * `singtol` - singular tolerance = 1e-8
-* `maxthreads` - maximum threads = num_cores()
+* `maxthreads` - maximum threads = min(num_cores(), Threads.nthreads())
 
 """
 function fit!(lmm::LMM{T}; kwargs...) where T
@@ -87,7 +87,7 @@ function fit!(lmm::LMM{T}; kwargs...) where T
     :refitinit ∈ kwkeys ? refitinit = kwargs[:refitinit] : refitinit = false
     :optmethod ∈ kwkeys ? optmethod = kwargs[:optmethod] : optmethod = :default
     :singtol ∈ kwkeys ? singtol = kwargs[:singtol] : singtol = 1e-8
-    :maxthreads ∈ kwkeys ? maxthreads = kwargs[:maxthreads] : maxthreads = num_cores()
+    :maxthreads ∈ kwkeys ? maxthreads = kwargs[:maxthreads] : maxthreads = min(num_cores(), Threads.nthreads())
 
     # If model was fitted, previous results can be used if `refitinit` == true
     # Before fitting clear log

diff --git a/src/gmat.jl b/src/gmat.jl
@@ -148,7 +148,7 @@ function gmat!(mx, θ, ::TOEP_)
     if s > 1
         for n = 2:s
             @inbounds @simd for m = 1:n-1
-                mx[m, n] = de * θ[n-m+1]
+                mx[m, n] = de * θ[n - m + 1]
             end
         end
     end
@@ -177,8 +177,9 @@ function gmat!(mx, θ, ::TOEPH_)
     end
     if s > 1
         for n = 2:s
+            @inbounds mxnn = mx[n, n]
             @inbounds @simd for m = 1:n-1
-                mx[m, n] = mx[m, m] * mx[n, n] * θ[n-m+s]
+                mx[m, n] = mx[m, m] * mxnn * θ[n - m + s]
             end
         end
     end
@@ -195,8 +196,9 @@ function gmat!(mx, θ, ct::TOEPHP_)
     end
     if s > 1 && ct.p > 1
         for m = 1:s - 1
+            @inbounds mxmm = mx[m, m]
             for n = m + 1:(m + ct.p - 1 > s ? s : m + ct.p - 1)
-                @inbounds mx[m, n] = mx[m, m] * mx[n, n] * θ[n - m + s]
+                @inbounds mx[m, n] = mxmm * mx[n, n] * θ[n - m + s]
             end
         end
     end
@@ -213,8 +215,9 @@ function gmat!(mx, θ, ::UN_)
     end
     if s > 1
         for n = 2:s
+            @inbounds mxnn = mx[n, n]
             @inbounds @simd for m = 1:n - 1
-                mx[m, n] = mx[m, m] * mx[n, n] * θ[s + tpnum(m, n, s)]
+                mx[m, n] = mx[m, m] * mxnn * θ[s + tpnum(m, n, s)]
             end
         end
     end

diff --git a/src/reml.jl b/src/reml.jl
@@ -33,7 +33,7 @@ end
 ################################################################################
 #                     REML without provided β
 ################################################################################
-function reml_sweep_β(lmm, data, θ::Vector{T}; maxthreads::Int = 16) where T # Main optimization way - make gradient / hessian analytical / semi-analytical functions
+function reml_sweep_β(lmm, data, θ::Vector{T}; maxthreads::Int = 4) where T # Main optimization way - make gradient / hessian analytical / semi-analytical functions
     n             = length(lmm.covstr.vcovblock)
     N             = length(lmm.data.yv)
     c             = (N - lmm.rankx)*log(2π)

diff --git a/src/rmat.jl b/src/rmat.jl
@@ -167,6 +167,11 @@ end
     return sqrt(sum)
 end
 =#
+"""
+    edistance(mx::AbstractMatrix{T}, i::Int, j::Int) where T
+
+Distance between vector mx[i, :] and mx[j, :].
+"""
 function edistance(mx::AbstractMatrix{T}, i::Int, j::Int) where T
     sum = zero(T)
     @inbounds for c = 1:size(mx, 2)
@@ -304,8 +309,9 @@ function unrmat(θ::AbstractVector{T}, rz) where T
     end
     if rm > 1
         for m = 1:rm - 1
+            @inbounds mxmm = mx[m, m] 
             @inbounds @simd for n = m + 1:rm
-                mx[m, n] += mx[m, m] * mx[n, n] * θ[rm+tpnum(m, n, rm)]
+                mx[m, n] += mxmm * mx[n, n] * θ[rm + tpnum(m, n, rm)]
             end
         end
     end

diff --git a/src/utils.jl b/src/utils.jl
@@ -31,7 +31,7 @@ function nterms(rhs::Union{Tuple{Vararg{AbstractTerm}}, Nothing, AbstractTerm})
     p
 end
 """
-    Rerm name.
+    Term name.
 """
 tname(t::AbstractTerm) = "$(t.sym)"
 tname(t::InteractionTerm) = join(tname.(t.terms), " & ")
@@ -204,8 +204,8 @@ end
 function logreml(lmm)
     -m2logreml(lmm)/2
 end
-function m2logreml(lmm, theta)
-    reml_sweep_β(lmm, LMMDataViews(lmm), theta)[1]
+function m2logreml(lmm, theta; maxthreads::Int = num_cores())
+    reml_sweep_β(lmm, LMMDataViews(lmm), theta; maxthreads = maxthreads)[1]
 end
 ################################################################################
 

diff --git a/src/varstruct.jl b/src/varstruct.jl
@@ -1,5 +1,26 @@
 const CType = Union{FunctionTerm{typeof(+), Vector{Term}}, FunctionTerm{typeof(*), Vector{Term}}, FunctionTerm{typeof(&), Vector{Term}}}
 
+import StatsModels: ContrastsMatrix, AbstractContrasts, modelcols
+
+"""
+    mutable struct RawCoding <: AbstractContrasts
+
+Contrast for CategoricalTerm to get column "as it is" for model matrix.
+"""
+mutable struct RawCoding <: AbstractContrasts
+end
+function StatsModels.ContrastsMatrix(contrasts::RawCoding, levels::AbstractVector{T}) where T
+    ContrastsMatrix(ones(1,1),
+                             ["levels"],
+                             levels,
+                             contrasts)
+end
+function StatsModels.modelcols(t::CategoricalTerm{RawCoding, T, N}, d::NamedTuple) where T where N
+    #v = d[t.sym]
+    #reshape(v, length(v), 1)  
+    d[t.sym]
+end
+
 ################################################################################
 #                     @covstr macro
 ################################################################################
@@ -154,7 +175,7 @@ end
 """
     Covarince structure.
 """
-struct CovStructure{T} <: AbstractCovarianceStructure
+struct CovStructure{T, T2} <: AbstractCovarianceStructure
     # Random effects
     random::Vector{VarEffect}
     # Repearted effects
@@ -180,7 +201,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure
     # unit range z column range for each random effect
     zrndur::Vector{UnitRange{Int}}
     # repeated effect parametrization matrix
-    rz::Vector{Matrix{T}}
+    rz::Vector{Matrix{T2}}
     # size 2 of z/rz matrix
     q::Vector{Int}
     # total number of parameters in each effect
@@ -209,7 +230,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure
         z       = Matrix{Float64}(undef, rown, 0)
         #subjz   = Vector{BitMatrix}(undef, alleffl)
         dicts   = Vector{Dict}(undef, alleffl)
-        # 
+        # unit range z column range for each random effect
         zrndur  = Vector{UnitRange{Int}}(undef, length(random))
         # Number of random effects
         rn      = length(random)
@@ -218,7 +239,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure
         # Number of repeated effects
         rpn     = length(repeated)
         # Z Matrix for repeated effect
-        rz      = Vector{Matrix{Float64}}(undef, rpn)
+        # rz      = Vector{Matrix{Float64}}(undef, rpn)
         # 
         #Theta parameter type
         ct      = Vector{Symbol}(undef, 0)
@@ -239,17 +260,30 @@ struct CovStructure{T} <: AbstractCovarianceStructure
                 if length(random[i].coding) == 0
                     fill_coding_dict!(random[i].model, random[i].coding, data)
                 end
-                schema[i] = apply_schema(random[i].model, StatsModels.schema(data, random[i].coding))
-                ztemp     = modelcols(MatrixTerm(schema[i]), data)
-                q[i]      = size(ztemp, 2)
+                if isa(random[i].model, ConstantTerm) # if only ConstantTerm in the model - data_ - first is collumn (responce)
+                    data_     = data[[first(keys(data))]] 
+                else
+                    data_     = data[StatsModels.termvars(random[i].model)] # only collumns for model
+                end
+                if isa(random[i].covtype.s, ZERO)
+                    schema[i] = InterceptTerm{false}()
+                    zsize     = 0
+                else
+                    schema[i] = apply_schema(random[i].model, StatsModels.schema(data_, random[i].coding))
+                    ztemp     = modelcols(MatrixTerm(schema[i]), data_)
+                    z         = hcat(z, ztemp)
+                    zsize     = size(ztemp, 2)
+                end
+
+                q[i]      = zsize
                 csp       = covstrparam(random[i].covtype.s, q[i])
                 t[i]      = sum(csp)
-                z         = hcat(z, ztemp)
+
                 fillur!(zrndur, i, q)
                 fillur!(tr, i, t)
                 symbs       = StatsModels.termvars(random[i].subj)
                 if length(symbs) > 0
-                    cdata     = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data), x) for x in symbs)
+                    cdata     = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data_), x) for x in symbs)
                     dicts[i]  = Dict{Tuple{eltype.(cdata)...}, Vector{Int}}()
                     indsdict!(dicts[i], cdata)
                 else
@@ -261,35 +295,44 @@ struct CovStructure{T} <: AbstractCovarianceStructure
                 append!(emap, fill(i, t[i]))
                 rtn += t[i]
             end
-
+
+        rz_      = Vector{Matrix}(undef, rpn)
         # REPEATED EFFECTS
         for i = 1:length(repeated)
 
             if length(repeated[i].coding) == 0
                 fill_coding_dict!(repeated[i].model, repeated[i].coding, data)
             end
-
-            schema[rn+i] = apply_schema(repeated[i].model, StatsModels.schema(data, repeated[i].coding))
-            rz[i]        = modelcols(MatrixTerm(schema[rn+i]), data)
+            if isa(repeated[i].model, ConstantTerm) # if only ConstantTerm in the model - data_ - first is collumn (responce)
+                data_     = data[[first(keys(data))]] 
+            else
+                data_     = data[StatsModels.termvars(repeated[i].model)] # only collumns for model
+            end
+
+            schema[rn + i] = apply_schema(repeated[i].model, StatsModels.schema(data_, repeated[i].coding))
+            #rz_[i]       = reduce(hcat, modelcols(schema[rn+i], data))
+            rz_[i]       = modelcols(MatrixTerm(schema[rn+i]), data_)
             symbs        = StatsModels.termvars(repeated[i].subj)
             if length(symbs) > 0
-                cdata       = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data), x) for x in symbs)
-                dicts[rn+i]  = Dict{Tuple{eltype.(cdata)...}, Vector{Int}}()
-                indsdict!(dicts[rn+i], cdata)
+                cdata    = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data), x) for x in symbs)
+                dicts[rn + i]  = Dict{Tuple{eltype.(cdata)...}, Vector{Int}}()
+                indsdict!(dicts[rn + i], cdata)
             else
                 dicts[rn+i]  = Dict(1 => collect(1:rown)) #changed to range
             end
 
-            sn[rn+i]     = length(dicts[rn+i])
-            q[rn+i]      = size(rz[i], 2)
+            sn[rn + i]   = length(dicts[rn+i])
+            q[rn + i]    = size(rz_[i], 2)
             csp          = covstrparam(repeated[i].covtype.s, q[rn+i])
-            t[rn+i]      = sum(csp)
-            tr[rn+i]     = UnitRange(sum(t[1:rn+i-1]) + 1, sum(t[1:rn+i-1]) + t[rn+i])
+            t[rn + i]    = sum(csp)
+            tr[rn + i]   = UnitRange(sum(t[1:rn+i-1]) + 1, sum(t[1:rn+i-1]) + t[rn+i])
             updatenametype!(ct, rcnames, csp, schema[rn+i], repeated[i].covtype.s)
-
             # emap
             append!(emap, fill(rn+i, t[rn+i]))
         end
+        T2  = typejoin(eltype.(rz_)...)
+        rz  = Vector{Matrix{T2}}(undef, rpn)
+        rz .= rz_
         # Theta length
         tl  = sum(t)
         ########################################################################
@@ -351,7 +394,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure
         end
         esb = EffectSubjectBlock(sblock, nblock)
         #######################################################################
-        new{eltype(z)}(random, repeated, schema, rcnames, blocks, rn, rtn, rpn, z, esb, zrndur, rz, q, t, tr, tl, ct, emap, sn, maxn)
+        new{eltype(z), T2}(random, repeated, schema, rcnames, blocks, rn, rtn, rpn, z, esb, zrndur, rz, q, t, tr, tl, ct, emap, sn, maxn)
     end
 end
 ###############################################################################
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,7 +30,6 @@ jobs: @@
         strategy:
           matrix:
             version:
-              - '1.6'
               - '1.8'
               - '1'
             os:
@@ Expand Down @@