SINDy paper discussion
Date:
These are rough lecture notes for a paper review discussion held in spring 2024 with the SMU Applied Mathematics/Data Science Journal Club. These notes are only a partial account of the content of the cited papers.
Author: Mason A. McCallum
Citations
[1] Bakarji J, Champion K, Kutz JN, Brunton SL. 2023 Discovering governing equations from partial measurements with deep delay autoencoders. Proc. R. Soc. A 479: 20230422. https://doi.org/10.1098/rspa.2023.0422
[2] Mangan NM, Brunton SL, Proctor JL, Kutz JN. 2016 Inferring biological networks by sparse identification of nonlinear dynamics. arXiv:1605.08368.
Introduction to SINDy
Here we will empirically determine a mathematical model of observed dynamical variables. To employ this technique we must collect time series data and corresponding time derivative data. Let $x:\mathbb{R}^+ \rightarrow \mathbb{R}^d$ be the state trajectory, sampled at times $t_1,\dots,t_m$:
\(X=[x(t_1), x(t_2),...,x(t_m)]^T\) \(\dot{X}=[\dot{x}(t_1), \dot{x}(t_2),...,\dot{x}(t_m)]^T\)
These time series matrices $X$ and $\dot{X}$ are in $\mathbb{R}^{m\times d}$.
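In practice one often measures only $X$ and must approximate $\dot{X}$ numerically (the code below instead reads the derivatives directly from the simulator). A minimal finite-difference sketch, where the helper name central_diff is my own:
# Approximate Ẋ from uniformly sampled X (m×d, time step dt) using
# second-order central differences, with one-sided differences at the ends.
function central_diff(X::Matrix{Float64}, dt::Float64)
    Ẋ = similar(X)
    Ẋ[2:end-1, :] = (X[3:end, :] .- X[1:end-2, :]) ./ (2dt)
    Ẋ[1, :]   = (X[2, :]   .- X[1, :])     ./ dt
    Ẋ[end, :] = (X[end, :] .- X[end-1, :]) ./ dt
    return Ẋ
end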
We will regularize our optimization to promote models with as few terms as possible.
The continuous-time dynamical system is written as $\frac{d}{dt}x(t)=f(x)$; the discretized equivalent of this system in terms of our time series data is $\dot{X}=\Theta(X)\xi$.
Definition: The library $\Theta$ is a large matrix $\Theta(X)\in\mathbb{R}^{m\times s}$: \(\Theta(X)=[1\;\; X_1\;\; X_2\;\; X_3\;\; X_1X_2\;\; X_1X_3\;\; X_2X_3\;\; X_1^2\;\cdots\; X^d\;\cdots\; \sin(X)\;\cdots ]\) Each column in the above definition corresponds to applying some mapping to our time series data.
Definition: The coefficient matrix $\xi \in \mathbb{R}^{s \times d}$ is a sparse matrix which we will optimize to satisfy $\dot{X}=\Theta(X)\xi$. This matrix serves as a means of selecting which library functions belong in the model.
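For instance, if the true dynamics of the first component were $\dot{x}_1=-x_2-x_3$ (as in the Rössler system used below) and the library were ordered as above, the corresponding column of $\xi$ would select exactly two library columns,
\[\xi_{\cdot 1} = [0\;\; 0\;\; -1\;\; -1\;\; 0\;\; \cdots\;\; 0]^T,\]
so that $\Theta(X)\xi_{\cdot 1} = -X_2-X_3$ reproduces $\dot{X}_1$.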
\[L_{\text{SINDy}}(\xi')=||\dot{X}-\Theta(X)\xi'||_2+\lambda||\xi'||_1\]
using Plots
using LinearAlgebra
FT = Float64
# define the Rossler attractor
@kwdef mutable struct Rossler
    dt::FT = 0.02
    a::FT = 0.2
    b::FT = 0.2
    c::FT = 5.7
    x::FT = 0.1
    y::FT = 1.0
    z::FT = 0.0
    dx::FT = 0.0
    dy::FT = 0.0
    dz::FT = 0.0
end
# advance the state by one forward-Euler step; dx,dy,dz hold the
# derivatives evaluated at the pre-step state
function step!(R::Rossler)
    R.dx = -R.y - R.z
    R.dy = R.x + R.a*R.y
    R.dz = R.b + R.z*(R.x - R.c)
    R.x += R.dt * R.dx
    R.y += R.dt * R.dy
    R.z += R.dt * R.dz
end
# print a single library monomial, e.g. exponents (2,1,0) ↦ *x1^2*x2^1*x3^0
function keytostring(keyline)
    numVars = size(keyline)[1]
    for var in 1:numVars
        print("*x", var, "^", keyline[var])
    end
end
# pretty-print the identified system, term by term
function printSystem(key, coefs)
    sizeLib, numVars = size(coefs)
    for var in 1:numVars
        print("\n ẋ", var, "=")
        for libterm in 1:sizeLib
            if !(coefs[libterm, var] ≈ 0.0)
                print("+ ", coefs[libterm, var], "(")
                keytostring(key[libterm, :])
                print(")")
            end
        end
    end
end
Build data matrices
numSteps = 10000
attractor = Rossler()
# Time series data collection
X = zeros(FT, numSteps, 3) #ℜᵐˣ³
Ẋ = zeros(FT, numSteps, 3) #ℜᵐˣ³
for ti=1:numSteps
    X[ti, :] = [attractor.x, attractor.y, attractor.z]
    step!(attractor) # step! evaluates dx,dy,dz at the state just recorded
    Ẋ[ti, :] = [attractor.dx, attractor.dy, attractor.dz]
end
# Library Formation
# all monomials x^a y^b z^c with exponents a,b,c ∈ {0,1,2}
funcs = [(x,y,z)->x^a*y^b*z^c for c=0:2 for b=0:2 for a=0:2];
labels = [(a,b,c) for c=0:2 for b=0:2 for a=0:2]
# order the library from simple terms to complex ones
order = sortperm(labels, by = x->sum(x)*sum(x.!=0))
funcs = funcs[order]
labels = labels[order]
Θ = zeros(FT, numSteps, length(funcs));
xi = zeros(FT, length(funcs), 3) # Ξ
for (idx, func) in enumerate(funcs)
    Θ[:,idx] = func.(X[:,1], X[:,2], X[:,3])
end
ptrue = plot(X[:,1],X[:,2],X[:,3],legend=false, title="truth");
Optimize
For reference, the true Rössler system we hope to recover is \(\begin{align*} &\dot{x}=-y-z\\ &\dot{y}=x+0.2y\\ &\dot{z}=0.2+z(x-5.7)\\ \end{align*}\)
"""
SparsifyDynamics for finding ξ
Theta: Θ(X)
dXdt: Ẋ
lambda: Regularization parameter weighting sparsinty of ξ.
"""
function sparsifyDynamics(Theta::Matrix{FT}, dXdt::Matrix{FT}, lambda::FT)
    Xi = Theta\dXdt # initial guess: least squares
    for k=1:10
        smallinds = (abs.(Xi) .< lambda)
        Xi[smallinds] .= 0 # thresholding
        for col = 1:size(dXdt,2)
            biginds = .!smallinds[:,col]
            # refit the surviving terms by least squares
            Xi[biginds,col] = Theta[:,biginds]\dXdt[:,col]
        end
    end
    return Xi
end
# Now we seek (approximately, via sequentially thresholded least squares)
# Ξ = argmin ||Ẋ-Θ(X)Ξ||₂ + λ||Ξ||₁
λ = 0.055
xi = sparsifyDynamics(Θ, Ẋ, λ) # one could also use Lasso regression among other sparse optimizers
# Now to print out the system
coefs = xi
key = transpose(reinterpret(reshape, Int, labels))
printSystem(key,coefs)
ẋ1=+ -1.0054302319769184(*x1^0*x2^1*x3^0)+ -0.9871832765872768(*x1^0*x2^0*x3^1)
ẋ2=+ 0.9998938754508867(*x1^1*x2^0*x3^0)+ 0.22139086315654422(*x1^0*x2^1*x3^0)
ẋ3=+ -5.765816978762432(*x1^0*x2^0*x3^1)+ 1.0220655880065033(*x1^1*x2^0*x3^1)+ 0.08193878455408737(*x1^0*x2^1*x3^1)
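Up to small coefficient errors this matches the true Rössler system above: the identified $\dot{x}_3$ drops the small constant $0.2$ and picks up a small spurious $x_2x_3$ term, but the dominant structure is recovered.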
# evaluate the right-hand side of the identified model at a point
function fmodel(x,y,z)
    dX = [0.0, 0.0, 0.0]
    for col in 1:size(xi,2)
        for row in 1:size(xi,1)
            dX[col] += xi[row,col]*funcs[row](x,y,z)
        end
    end
    return dX
end
# integrate the identified model with forward Euler from a nearby initial condition
xmodel = zeros(FT, numSteps, 3)
xmodel[1,:] = [0.01, 1.0, 0.0]
for i=2:numSteps
    xmodel[i,:] = xmodel[i-1,:] + 0.02 * fmodel(xmodel[i-1,1], xmodel[i-1,2], xmodel[i-1,3])
end
pmodel = plot(xmodel[:,1],xmodel[:,2],xmodel[:,3],legend=false, title="model");
plot(ptrue, pmodel, layout=(1,2))
[Figure: true Rössler attractor ("truth") beside the trajectory of the identified model ("model").]
Implicit SINDy
As an aside I will describe a more general formulation of SINDy. Many mathematical models require rational terms; such models frequently occur in mathematical biology. This formulation of SINDy allows for more general nonlinearities beyond polynomial or trigonometric functions. We will consider models of the form $\dot{\bm{x}} = f(\bm{x})$, but here we allow each component of $f(\bm{x})$ to be a rational function.
\[\dot{x}_k = \frac{f_N(\bm{x})}{f_D(\bm{x})}\]or, written otherwise:
\[f_N(\bm{x})-f_D(\bm{x})\dot{x}_k = 0\]This motivates the construction of a modified library for species $k$: \(\Theta(X, \dot{x}_k(t))=\left[\Theta_N(X)\;\;\;\mathrm{diag}(\dot{x}_k)\Theta_D(X)\right]\)
\[\Theta(X, \dot{x}_k(t))\xi_k = 0\]This is the data matrix form of our implicit evolution equations. To achieve the desired sparse system of equations we seek a nonzero element of the null space of $\Theta(X, \dot{x}_k(t))$. Let $N$ be a matrix whose columns span that null space. We then seek the sparsest vector $\xi_k$ in the range of $N$ via the alternating directions method (ADM), restricting the optimization with a threshold $\lambda$ on the minimum magnitude of the entries of $\xi_k$.
"The appropriate $\lambda$ is unknown a priori. Each $\xi(\lambda)$ produces an inferred model of varying accuracy and sparsity. From these models we calculate a Pareto front and select the most parsimonious model… readily identifiable at the sharp drop-off on the error in $\Theta\xi=0$." [2]
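A minimal sketch of this procedure, assuming the augmented library Θaug has already been formed; the helper names nullspace_basis, soft, and adm_sparsest are my own, and this illustrates generic ADM rather than the implementation from [2]:
using LinearAlgebra

# Basis for the (numerical) null space of the augmented library: the right
# singular vectors whose singular values are essentially zero.
function nullspace_basis(Θaug; tol=1e-8)
    F = svd(Θaug)
    return F.V[:, F.S .< tol * F.S[1]]
end

# Soft-thresholding operator used to promote sparsity.
soft(x, λ) = sign.(x) .* max.(abs.(x) .- λ, 0)

# Alternating directions method: search for a sparse vector in the range of N.
function adm_sparsest(N, λ; iters=1000)
    q = normalize(randn(size(N, 2))) # random point on the unit sphere
    for _ in 1:iters
        ξ = soft(N * q, λ)    # sparsify the candidate ξ = Nq
        q = normalize(N' * ξ) # project back into the range of N
    end
    return N * q
end
Sweeping λ and plotting the error in $\Theta\xi=0$ against the number of surviving terms traces out the Pareto front described in the quote above.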
We now desire a formulation which does not restrict one to rational nonlinearities alone, but instead yields models which are some combination of rational, polynomial, and trigonometric nonlinearities.
Autoencoder: coordinate system discovery
In the above discussion we collected data in the coordinate system of the dynamical variables $x,y,z$. Suppose, however, that the observed data were not in the dynamical coordinate system. This is akin to developing orbital mechanics before Copernicus suggested a heliocentric coordinate system: the differential equations in a geocentric coordinate system are very messy, and only once Copernicus posited the heliocentric frame did the dynamical system become discoverable, leading to Kepler's findings. In the following discussion we will observe data from a system and then, in the spirit of Copernicus, discover an appropriate coordinate system for the dynamics. This is accomplished using an autoencoder to map our observables into a latent space; within that latent space, SINDy plays the role of Kepler and discovers a dynamical model. Beyond the general conceptual point that mathematical modeling should happen in an appropriate coordinate system, the interesting detail of the autoencoder is the selection of loss functions that constrain the optimization and yield an appropriate mapping from observation space into the dynamical space.
Consider the nonlinear differential equation $\frac{d}{dt}x(t)=f(x(t))$. Let $y(t)=g(x(t))+\eta$ be noisy measurements of the system via some observation operator $g$; for example, let $g$ be the projection of the system onto $e_1$. Consider some unknown mapping from the measured time-delay space to the dynamical variables, $z(t)=\phi(y(t))$. Upon finding $\phi$ (via an autoencoder), the goal is to discover the dynamical system $\frac{d}{dt}z(t)=h(z(t))$ (via SINDy).
Time series data will be collected into the Hankel matrix \(H= \begin{bmatrix} y(t_1) & y(t_2) & \cdots & y(t_q)\\ y(t_2) & y(t_3) & \cdots & y(t_{q+1})\\ \vdots & \vdots & \ddots & \vdots\\ y(t_n) & y(t_{n+1}) & \cdots & y(t_{n+q-1})\\ \end{bmatrix}=[h_1\; h_2\; \cdots\; h_q]\)
The autoencoder network is a mapping from the column space of the $H$ matrix into our latent space. The punchline is that from noisy observation data (e.g. collecting just a single dynamical variable) the full dynamics can be discovered after finding the appropriate mapping into the latent space. Note that in the paper one can also account for noise by taking a reduced SVD of the $H$ matrix.
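A sketch of that denoising step, assuming the Hankel matrix H from the code below and a chosen truncation rank r:
using LinearAlgebra
r = 3 # assumed truncation rank
F = svd(H)
H_r = F.U[:, 1:r] * Diagonal(F.S[1:r]) * F.V[:, 1:r]' # rank-r approximation of H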
Warning
This code is not finished. Not all of the loss functions are implemented, so the search for the appropriate mapping to latent space will not be correct. Most of the bones are there, but the remaining loss functions are needed.
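For reference, one of the missing pieces is a SINDy-consistency loss in the latent space: the latent velocity $\dot{z}$ obtained from the encoder via the chain rule should agree with the SINDy prediction. A hedged sketch of such a term, where θrow (the library evaluated at a single latent point), Ξ, and the use of ForwardDiff for the directional derivative are my assumptions rather than the paper's implementation:
using ForwardDiff

# ż from the chain rule, ż = ∇φ(h)·ḣ, computed as a directional derivative of
# the encoder along ḣ, compared against the SINDy prediction Ξᵀθ(z).
function sindy_latent_loss(enc, h, ḣ, Ξ, θrow)
    ż = ForwardDiff.derivative(t -> enc(h .+ t .* ḣ), 0.0)
    z = enc(h)
    return sum(abs2, ż .- Ξ' * θrow(z))
end
In [1] terms of this flavor are combined with the reconstruction loss and a sparsity penalty on Ξ.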
using Flux, Plots, ProgressMeter
# Time series data collection
X_sim = zeros(FT, numSteps, 3)
Ẋ_sim = zeros(FT, numSteps, 3)
for ti=1:numSteps
    X_sim[ti, :] = [attractor.x, attractor.y, attractor.z]
    step!(attractor) # derivatives are evaluated at the state just recorded
    Ẋ_sim[ti, :] = [attractor.dx, attractor.dy, attractor.dz]
end
# build a d×q Hankel (delay-embedding) matrix from a scalar time series
function hankel(data, d)
    len = length(data)
    q = len - d
    H = Matrix{typeof(data[1])}(undef, d, q)
    for col=1:q
        H[:, col] = data[col:col+d-1]
    end
    return H
end
n = 60  # delay-embedding window length
n2 = 30 # hidden layer width
m = 15  # hidden layer width
r = 3   # latent dimension
H = hankel(X_sim[:,1], n)
data = [Vector(H[:,col]) for col=1:size(H)[2]]
enc = Chain(
    Dense(n => n2),
    Dense(n2 => m),
    Dense(m => m),
    Dense(m => m),
    Dense(m => m),
    Dense(m => r),
)
dec = Chain(
    Dense(r => m),
    Dense(m => m),
    Dense(m => m),
    Dense(m => m),
    Dense(m => n2),
    Dense(n2 => n)
)
model = Chain(enc, dec)
optim = Flux.setup(Adam(), model)
@showprogress for epoch=1:100
    Flux.train!(model, data[1:5000], optim) do m, x
        # reconstruction loss plus a pin of the first latent coordinate to the measurement
        sum((m(x) .- x).^2) + (m[1](x)[1] - x[1])^2
    end
end
latent_data = [model[1](d) for d in data]
X = latent_data
# forward-difference approximation of ż in the latent space
Ẋ = [(latent_data[idx+1] - latent_data[idx])/0.02 for idx=1:length(X)-1]
X = transpose(hcat(X[1:end-1]...))
Ẋ = Matrix(transpose(hcat(Ẋ...)))
attractor = Rossler()
# Library Formation
funcs = [(x,y,z)->x^a*y^b*z^c for c=0:2 for b=0:2 for a=0:2];
labels = [(a,b,c) for c=0:2 for b=0:2 for a=0:2]
order = sortperm(labels, by = x->sum(x)*sum(x.!=0))
funcs = funcs[order]
labels = labels[order]
numSteps = size(X)[1]
Θ = zeros(FT, numSteps, length(funcs));
xi = zeros(FT, length(funcs), 3)# Ξ
for (idx, func) in enumerate(funcs)
    Θ[:,idx] = func.(X[:,1], X[:,2], X[:,3])
end
# Now we seek (approximately, via sequentially thresholded least squares)
# Ξ = argmin ||Ẋ-Θ(X)Ξ||₂ + λ||Ξ||₁
λ = 0.061
xi = sparsifyDynamics(Θ, Ẋ, λ)
coefs = xi
key = transpose(reinterpret(reshape, Int, labels));
printSystem(key,coefs)
function fmodel(x,y,z)
    dX = [0.0, 0.0, 0.0]
    for col in 1:size(xi,2)
        for row in 1:size(xi,1)
            dX[col] += xi[row,col]*funcs[row](x,y,z)
        end
    end
    return dX
end
xmodel = zeros(FT, numSteps, 3)
xmodel[1,:] = [0.01, 1.0, 0.0]
for i=2:numSteps
    xmodel[i,:] = xmodel[i-1,:] + 0.02 * fmodel(xmodel[i-1,1], xmodel[i-1,2], xmodel[i-1,3])
end
pmodel = plot(xmodel[:,1],xmodel[:,2],xmodel[:,3],legend=false, title="model");
plot(ptrue, pmodel, layout=(1,2))
ẋ1=+ 0.6963388095479174(*x1^0*x2^0*x3^0)+ 0.2792826819737061(*x1^1*x2^0*x3^0)+ 0.061130985589659785(*x1^0*x2^0*x3^1)
ẋ2=+ 39.65671143525014(*x1^0*x2^0*x3^0)+ -2.056409520378554(*x1^1*x2^0*x3^0)+ 1.8252778746922187(*x1^0*x2^1*x3^0)+ 5.282539361932167(*x1^0*x2^0*x3^1)+ 4.255475137871766(*x1^2*x2^0*x3^0)+ -0.1829295578441074(*x1^1*x2^1*x3^0)+ -0.49365590871893644(*x1^1*x2^0*x3^1)
ẋ3=+ -115.7144459596724(*x1^0*x2^0*x3^0)+ 135.70854614965396(*x1^1*x2^0*x3^0)+ -5.519357602826046(*x1^0*x2^1*x3^0)+ -10.005406528755964(*x1^0*x2^0*x3^1)+ -16.16325533002859(*x1^2*x2^0*x3^0)+ 0.6774592686814631(*x1^1*x2^1*x3^0)+ 1.307410828751469(*x1^1*x2^0*x3^1)
[Figure: true attractor ("truth") beside the trajectory of the latent-space model ("model").]
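As warned above, without the remaining loss functions the learned latent coordinates are not ones in which the dynamics are sparse, so the identified latent model is dense and inaccurate rather than parsimonious.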