diff --git a/benchmarks/benchFourierLayer.jl b/benchmarks/benchFourierLayer.jl
index 71e6e31..99ebf46 100644
--- a/benchmarks/benchFourierLayer.jl
+++ b/benchmarks/benchFourierLayer.jl
@@ -1,10 +1,10 @@
-# Stolen from Flux as well
-
-for n in [2, 20, 200, 2000]
-    x = randn(Float32, n, 2000, 100)
-    model = FourierLayer(n, n, 2000, 100, 16)
-    println("CPU n=$n")
-    run_benchmark(model, x, cuda=false)
-    println("CUDA n=$n")
-    run_benchmark(model, x, cuda=true)    
-end
\ No newline at end of file
+# Stolen from Flux as well
+
+for n in [2, 20, 200, 2000]
+    x = randn(Float32, n, 2000, 100)
+    model = FourierLayer(n, n, 2000, 100, 16)
+    println("CPU n=$n")
+    run_benchmark(model, x, cuda = false)
+    println("CUDA n=$n")
+    run_benchmark(model, x, cuda = true)
+end
diff --git a/benchmarks/bench_utils.jl b/benchmarks/bench_utils.jl
index a20d019..1802802 100644
--- a/benchmarks/bench_utils.jl
+++ b/benchmarks/bench_utils.jl
@@ -1,51 +1,55 @@
-using BenchmarkTools
-using Flux
-using OperatorLearning
-using CUDA
-using Zygote: pullback
-
-# Stolen from Flux
-
-fw(m, x) = m(x)
-bw(back) = back(1f0)
-fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps)
-  
-function run_benchmark(model, x; cuda=true)
-    
-    if cuda 
-        model = model |> gpu
-        x = x |> gpu
-    end
-
-    ps = Flux.params(model)
-    y, back = pullback(() -> sum(model(x)), ps)
-
-
-    if cuda
-        CUDA.allowscalar(false)
-        # CUDA.device!(3)
-        println("  forward")
-        fw(model, x); GC.gc(); CUDA.reclaim(); #warmup
-        @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim())
-
-        println("  backward")
-        bw(back); GC.gc(); CUDA.reclaim(); #warmup
-        @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim())
-        
-        println("  forw and back")
-        fwbw(model, ps, x); GC.gc(); CUDA.reclaim(); #warmup
-        @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim())
-    else
-        println("  forward")
-        fw(model, x)  #warmup
-        @btime fw($model, $x)
-
-        println("  backward")
-        bw(back)  #warmup
-        @btime bw($back)
-
-        println("  forw and back")
-        fwbw(model, ps, x) # warmup
-        @btime fwbw($model, $ps, $x)
-    end
-end
\ No newline at end of file
+using BenchmarkTools
+using Flux
+using OperatorLearning
+using CUDA
+using Zygote: pullback
+
+# Stolen from Flux
+
+fw(m, x) = m(x)
+bw(back) = back(1.0f0)
+fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps)
+
+function run_benchmark(model, x; cuda = true)
+    if cuda
+        model = model |> gpu
+        x = x |> gpu
+    end
+
+    ps = Flux.params(model)
+    y, back = pullback(() -> sum(model(x)), ps)
+
+    if cuda
+        CUDA.allowscalar(false)
+        # CUDA.device!(3)
+        println("  forward")
+        fw(model, x);
+        GC.gc();
+        CUDA.reclaim(); #warmup
+        @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim())
+
+        println("  backward")
+        bw(back);
+        GC.gc();
+        CUDA.reclaim(); #warmup
+        @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim())
+
+        println("  forw and back")
+        fwbw(model, ps, x);
+        GC.gc();
+        CUDA.reclaim(); #warmup
+        @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim())
+    else
+        println("  forward")
+        fw(model, x)  #warmup
+        @btime fw($model, $x)
+
+        println("  backward")
+        bw(back)  #warmup
+        @btime bw($back)
+
+        println("  forw and back")
+        fwbw(model, ps, x) # warmup
+        @btime fwbw($model, $ps, $x)
+    end
+end
diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl
index 5458037..c37f3d3 100644
--- a/benchmarks/runbenchmarks.jl
+++ b/benchmarks/runbenchmarks.jl
@@ -1,7 +1,7 @@
-using OperatorLearning, Flux
-
-versioninfo()
-include("bench_utils.jl")
-
-@info "Benchmark FourierLayer"
-include("benchFourierLayer.jl")
\ No newline at end of file
+using OperatorLearning, Flux
+
+versioninfo()
+include("bench_utils.jl")
+
+@info "Benchmark FourierLayer"
+include("benchFourierLayer.jl")
diff --git a/docs/make.jl b/docs/make.jl
index 12ec39e..83b5bf8 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,29 +1,28 @@
 using OperatorLearning
 using Documenter, DocumenterTools
 
-DocMeta.setdocmeta!(OperatorLearning, :DocTestSetup, :(using OperatorLearning); recursive=true)
+DocMeta.setdocmeta!(OperatorLearning, :DocTestSetup, :(using OperatorLearning); recursive = true)
 
 makedocs(;
-    modules=[OperatorLearning],
-    authors="Patrick Zimbrod <patrick.zimbrod@gmail.com> and contributors",
-    repo="https://github.com/pzimbrod/OperatorLearning.jl/blob/{commit}{path}#{line}",
-    sitename="OperatorLearning.jl",
-    format=Documenter.HTML(;
-        prettyurls=get(ENV, "CI", "false") == "true",
-        canonical="https://pzimbrod.github.io/OperatorLearning.jl",
-        assets=String[],
+    modules = [OperatorLearning],
+    authors = "Patrick Zimbrod <patrick.zimbrod@gmail.com> and contributors",
+    repo = "https://github.com/pzimbrod/OperatorLearning.jl/blob/{commit}{path}#{line}",
+    sitename = "OperatorLearning.jl",
+    format = Documenter.HTML(;
+        prettyurls = get(ENV, "CI", "false") == "true",
+        canonical = "https://pzimbrod.github.io/OperatorLearning.jl",
+        assets = String[]
     ),
-    pages=[
+    pages = [
         "Home" => "index.md",
-        "Examples" =>
-            ["Burgers Equation with FNO" => "examples/burgers_FNO.md",
-             "Burgers Equation with DeepONet" => "examples/burgers_DeepONet.md"],
+        "Examples" => ["Burgers Equation with FNO" => "examples/burgers_FNO.md",
+            "Burgers Equation with DeepONet" => "examples/burgers_DeepONet.md"],
         "Frequently Asked Questions" => "faq.md",
-        "Module Reference" => "reference.md",
-    ],
+        "Module Reference" => "reference.md"
+    ]
 )
 
 deploydocs(;
-    repo="github.com/pzimbrod/OperatorLearning.jl",
-    #devbranch="main",
+    repo = "github.com/pzimbrod/OperatorLearning.jl",
+#devbranch="main",
 )
diff --git a/docs/src/reference.md b/docs/src/reference.md
index cb13621..9b7c1f5 100644
--- a/docs/src/reference.md
+++ b/docs/src/reference.md
@@ -1,3 +1,3 @@
 ```@autodocs
 Modules = [OperatorLearning]
-```
\ No newline at end of file
+```
diff --git a/examples/burgers_DeepONet.jl b/examples/burgers_DeepONet.jl
index 8bc0ed5..cf8bcf8 100644
--- a/examples/burgers_DeepONet.jl
+++ b/examples/burgers_DeepONet.jl
@@ -21,22 +21,22 @@ subsample = 2^3;
 # create the x training array, according to our desired grid size
 xtrain = vars["a"][1:1000, 1:subsample:end]' |> device;
 # create the x test array
-xtest = vars["a"][end-99:end, 1:subsample:end]' |> device;
+xtest = vars["a"][(end - 99):end, 1:subsample:end]' |> device;
 
 # Create the y training array
 ytrain = vars["u"][1:1000, 1:subsample:end] |> device;
 # Create the y test array
-ytest = vars["u"][end-99:end, 1:subsample:end] |> device;
+ytest = vars["u"][(end - 99):end, 1:subsample:end] |> device;
 
 # The data is missing grid data, so we create it
 # `collect` converts data type `range` into an array
-grid = collect(range(0, 1, length=1024))' |> device
+grid = collect(range(0, 1, length = 1024))' |> device
 
 # Create the DeepONet:
 # IC is given on grid of 1024 points, and we solve for a fixed time t in one
 # spatial dimension x, making the branch input of size 1024 and trunk size 1
 # We choose GeLU activation for both subnets
-model = DeepONet((1024,1024,1024),(1,1024,1024),gelu,gelu) |> device
+model = DeepONet((1024, 1024, 1024), (1, 1024, 1024), gelu, gelu) |> device
 
 # We use the ADAM optimizer for training
 learning_rate = 0.001
@@ -48,12 +48,12 @@ parameters = params(model)
 # The loss function
 # We can't use the "vanilla" implementation of the mse here since we have
 # two distinct inputs to our DeepONet, so we wrap them into a tuple
-loss(xtrain,ytrain,sensor) = Flux.Losses.mse(model(xtrain,sensor),ytrain)
+loss(xtrain, ytrain, sensor) = Flux.Losses.mse(model(xtrain, sensor), ytrain)
 
 # Define a callback function that gives some output during training
-evalcb() = @show(loss(xtest,ytest,grid))
+evalcb() = @show(loss(xtest, ytest, grid))
 # Print the callback only every 5 seconds
 throttled_cb = throttle(evalcb, 5)
 
 # Do the training loop
-Flux.@epochs 500 train!(loss, parameters, [(xtrain,ytrain,grid)], opt, cb = evalcb)
+Flux.@epochs 500 train!(loss, parameters, [(xtrain, ytrain, grid)], opt, cb = evalcb)
diff --git a/examples/burgers_FNO.jl b/examples/burgers_FNO.jl
index 896dda8..38bfdce 100644
--- a/examples/burgers_FNO.jl
+++ b/examples/burgers_FNO.jl
@@ -13,55 +13,57 @@ subsample = 2^3;
 # create the x training array, according to our desired grid size
 xtrain = vars["a"][1:1000, 1:subsample:end] |> device;
 # create the x test array
-xtest = vars["a"][end-99:end, 1:subsample:end] |> device;
+xtest = vars["a"][(end - 99):end, 1:subsample:end] |> device;
 
 # Create the y training array
 ytrain = vars["u"][1:1000, 1:subsample:end] |> device;
 # Create the y test array
-ytest = vars["u"][end-99:end, 1:subsample:end] |> device;
+ytest = vars["u"][(end - 99):end, 1:subsample:end] |> device;
 
 # The data is missing grid data, so we create it
 # `collect` converts data type `range` into an array
-grid = collect(range(0, 1, length=length(xtrain[1,:]))) |> device
+grid = collect(range(0, 1, length = length(xtrain[1, :]))) |> device
 
 # Merge the created grid with the data
 # Output has the dims: batch x grid points x 2  (a(x), x)
 # First, reshape the data to a 3D tensor,
 # Then, create a 3D tensor from the synthetic grid data
 # and concatenate them along the newly created 3rd dim
-xtrain = cat(reshape(xtrain,(1000,1024,1)),
-            reshape(repeat(grid,1000),(1000,1024,1));
-            dims=3) |> device
-ytrain = cat(reshape(ytrain,(1000,1024,1)),
-            reshape(repeat(grid,1000),(1000,1024,1));
-            dims=3) |> device
+xtrain = cat(reshape(xtrain, (1000, 1024, 1)),
+    reshape(repeat(grid, 1000), (1000, 1024, 1));
+    dims = 3) |> device
+ytrain = cat(reshape(ytrain, (1000, 1024, 1)),
+    reshape(repeat(grid, 1000), (1000, 1024, 1));
+    dims = 3) |> device
 # Same treatment with the test data
-xtest = cat(reshape(xtest,(100,1024,1)),
-            reshape(repeat(grid,100),(100,1024,1));
-            dims=3) |> device
-ytest = cat(reshape(ytest,(100,1024,1)),
-            reshape(repeat(grid,100),(100,1024,1));
-            dims=3) |> device
+xtest = cat(reshape(xtest, (100, 1024, 1)),
+    reshape(repeat(grid, 100), (100, 1024, 1));
+    dims = 3) |> device
+ytest = cat(reshape(ytest, (100, 1024, 1)),
+    reshape(repeat(grid, 100), (100, 1024, 1));
+    dims = 3) |> device
 
 # Our net wants the input in the form (2,grid,batch), though,
 # So we permute
-xtrain, xtest = permutedims(xtrain,(3,2,1)), permutedims(xtest,(3,2,1)) |> device
-ytrain, ytest = permutedims(ytrain,(3,2,1)), permutedims(ytest,(3,2,1)) |> device
+xtrain, xtest = permutedims(xtrain, (3, 2, 1)), permutedims(xtest, (3, 2, 1)) |> device
+ytrain, ytest = permutedims(ytrain, (3, 2, 1)), permutedims(ytest, (3, 2, 1)) |> device
 
 # Pass the data to the Flux DataLoader and give it a batch of 20
-train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20, shuffle=true) |> device
-test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20, shuffle=false) |> device
+train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize = 20, shuffle = true) |>
+               device
+test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize = 20, shuffle = false) |>
+              device
 
 # Set up the Fourier Layer
 # 128 in- and outputs, batch size 20 as given above, grid size 1024
 # 16 modes to keep, σ activation on the gpu
-layer = FourierLayer(128,128,1024,16,gelu,bias_fourier=false) |> device
+layer = FourierLayer(128, 128, 1024, 16, gelu, bias_fourier = false) |> device
 
 # The whole architecture
 # linear transform into the latent space, 4 Fourier Layers,
 # then transform it back
-model = Chain(Dense(2,128;bias=false), layer, layer, layer, layer,
-                Dense(128,2;bias=false)) |> device
+model = Chain(Dense(2, 128; bias = false), layer, layer, layer, layer,
+    Dense(128, 2; bias = false)) |> device
 
 # We use the ADAM optimizer for training
 learning_rate = 0.001
@@ -71,10 +73,10 @@ opt = ADAM(learning_rate)
 parameters = params(model)
 
 # The loss function
-loss(x,y) = Flux.Losses.mse(model(x),y)
+loss(x, y) = Flux.Losses.mse(model(x), y)
 
 # Define a callback function that gives some output during training
-evalcb() = @show(loss(xtest,ytest))
+evalcb() = @show(loss(xtest, ytest))
 # Print the callback only every 5 seconds, 
 throttled_cb = throttle(evalcb, 5)
 
@@ -82,10 +84,10 @@ throttled_cb = throttle(evalcb, 5)
 Flux.@epochs 500 train!(loss, parameters, train_loader, opt, cb = throttled_cb)
 
 # Accuracy metrics
-val_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=1, shuffle=false) |> device
+val_loader = Flux.Data.DataLoader((xtest, ytest), batchsize = 1, shuffle = false) |> device
 loss = 0.0 |> device
 
-for (x,y) in val_loader
+for (x, y) in val_loader
     ŷ = model(x)
-    loss += Flux.Losses.mse(ŷ,y)
+    loss += Flux.Losses.mse(ŷ, y)
 end
diff --git a/src/ComplexWeights.jl b/src/ComplexWeights.jl
index e504668..f3c1b21 100644
--- a/src/ComplexWeights.jl
+++ b/src/ComplexWeights.jl
@@ -4,7 +4,9 @@ cglorot_uniform([rng=GLOBAL_RNG], dims...)
 A modification of the `glorot_uniform` function provided by `Flux` to accommodate Complex numbers.
 This is necessary since the parameters of the global convolution operator in the Fourier Layer generally has complex weights.
 """
-cglorot_uniform(rng::AbstractRNG, dims...) = (rand(rng, ComplexF32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
+function cglorot_uniform(rng::AbstractRNG, dims...)
+    (rand(rng, ComplexF32, dims...) .- 0.5f0) .* sqrt(24.0f0 / sum(nfan(dims...)))
+end
 cglorot_uniform(dims...) = cglorot_uniform(Random.GLOBAL_RNG, dims...)
 cglorot_uniform(rng::AbstractRNG) = (dims...) -> cglorot_uniform(rng, dims...)
 
@@ -14,6 +16,8 @@ cglorot_normal([rng=GLOBAL_RNG], dims...)
 A modification of the `glorot_normal` function provided by `Flux` to accommodate Complex numbers.
 This is necessary since the parameters of the global convolution operator in the Fourier Layer generally has complex weights.
 """
-cglorot_normal(rng::AbstractRNG, dims...) = randn(rng, ComplexF32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
+function cglorot_normal(rng::AbstractRNG, dims...)
+    randn(rng, ComplexF32, dims...) .* sqrt(2.0f0 / sum(nfan(dims...)))
+end
 cglorot_normal(dims...) = cglorot_normal(Random.GLOBAL_RNG, dims...)
-cglorot_normal(rng::AbstractRNG) = (dims...) -> cglorot_normal(rng, dims...)
\ No newline at end of file
+cglorot_normal(rng::AbstractRNG) = (dims...) -> cglorot_normal(rng, dims...)
diff --git a/src/DeepONet.jl b/src/DeepONet.jl
index 2580bb3..9223bd5 100644
--- a/src/DeepONet.jl
+++ b/src/DeepONet.jl
@@ -77,20 +77,19 @@ end
 
 # Declare the function that assigns Weights and biases to the layer
 function DeepONet(architecture_branch::Tuple, architecture_trunk::Tuple,
-                        act_branch = identity, act_trunk = identity;
-                        init_branch = Flux.glorot_uniform,
-                        init_trunk = Flux.glorot_uniform,
-                        bias_branch=true, bias_trunk=true)
-
+        act_branch = identity, act_trunk = identity;
+        init_branch = Flux.glorot_uniform,
+        init_trunk = Flux.glorot_uniform,
+        bias_branch = true, bias_trunk = true)
     @assert architecture_branch[end] == architecture_trunk[end] "Branch and Trunk net must share the same amount of nodes in the last layer. Otherwise Σᵢ bᵢⱼ tᵢₖ won't work."
 
     # To construct the subnets we use the helper function in subnets.jl
     # Initialize the branch net
     branch_net = construct_subnet(architecture_branch, act_branch;
-                                    init=init_branch, bias=bias_branch)
+        init = init_branch, bias = bias_branch)
     # Initialize the trunk net
     trunk_net = construct_subnet(architecture_trunk, act_trunk;
-                                    init=init_trunk, bias=bias_trunk)
+        init = init_trunk, bias = bias_trunk)
 
     return DeepONet(branch_net, trunk_net)
 end
@@ -113,13 +112,14 @@ function (a::DeepONet)(x::AbstractArray, y::AbstractVecOrMat)
 end
 
 # Sensors stay the same and shouldn't be batched
-(a::DeepONet)(x::AbstractArray, y::AbstractArray) = 
-  throw(ArgumentError("Sensor locations fed to trunk net can't be batched."))
+function (a::DeepONet)(x::AbstractArray, y::AbstractArray)
+    throw(ArgumentError("Sensor locations fed to trunk net can't be batched."))
+end
 
 # Print nicely
 function Base.show(io::IO, l::DeepONet)
-    print(io, "DeepONet with\nbranch net: (",l.branch_net)
+    print(io, "DeepONet with\nbranch net: (", l.branch_net)
     print(io, ")\n")
     print(io, "Trunk net: (", l.trunk_net)
     print(io, ")\n")
-end
\ No newline at end of file
+end
diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl
index 43e6648..468c8e3 100644
--- a/src/FourierLayer.jl
+++ b/src/FourierLayer.jl
@@ -1,136 +1,139 @@
-"""
-`FourierLayer(in, out, grid, modes, σ=identity, init=glorot_uniform)`
-`FourierLayer(Wf::AbstractArray, Wl::AbstractArray, [bias_f, bias_l, σ])`
-
-Create a Layer of the Fourier Neural Operator as proposed by Li et al.
-arXiv: 2010.08895
-
-The layer does a fourier transform on the grid dimension of the input array,
-filters higher modes out by the weight matrix and transforms it to the
-specified output dimension such that In x M x N -> Out x M x N.
-The output though only contains the relevant Fourier modes with the rest padded to zero
-in the last axis as a result of the filtering.
-
-The input `x` should be a rank 3 tensor of shape
-(num parameters (`in`) x num grid points (`grid`) x batch size (`batch`))
-The output `y` will be a rank 3 tensor of shape
-(`out` x num grid points (`grid`) x batch size (`batch`))
-
-You can specify biases for the paths as you like, though the convolutional path is
-originally not intended to perform an affine transformation.
-
-# Examples
-Say you're considering a 1D diffusion problem on a 64 point grid. The input is comprised
-of the grid points as well as the IC at this point.
-The data consists of 200 instances of the solution.
-Beforehand we convert the two input channels into a higher-dimensional latent space with 128 nodes by using a regular `Dense` layer.
-So the input takes the dimension `128 x 64 x 200`.
-The output would be the diffused variable at a later time, which initially makes the output of the form `128 x 64 x 200` as well. Finally, we have to squeeze this high-dimensional ouptut into the one quantity of interest again by using a `Dense` layer.
-
-We wish to only keep the first 16 modes of the input and work with the classic sigmoid function as activation.
-
-So we would have:
-
-```julia
-model = FourierLayer(128, 128, 100, 16, σ)
-```
-"""
-struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat,Bf,Bl}
-    # F: Activation, Tc/Tr: Complex/Real eltype
-    Wf::AbstractArray{Tc,3}
-    Wl::AbstractMatrix{Tr}
-    grid::Int
-    σ::F
-    λ::Int
-    bf::Bf
-    bl::Bl
-    # Constructor for the entire fourier layer
-    function FourierLayer(
-        Wf::AbstractArray{Tc,3}, Wl::AbstractMatrix{Tr}, 
-        grid::Int, σ::F = identity,
-        λ::Int = 12, bf = true, bl = true) where
-        {F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat}
-
-        # create the biases with one singleton dimension
-        bf = Flux.create_bias(Wf, bf, 1, size(Wf,2), size(Wf,3))
-        bl = Flux.create_bias(Wl, bl, 1, size(Wl,1), grid)
-        new{F,Tc,Tr,typeof(bf),typeof(bl)}(Wf, Wl, grid, σ, λ, bf, bl)
-    end
-end
-
-# Declare the function that assigns Weights and biases to the layer
-# `in` and `out` refer to the dimensionality of the number of parameters
-# `modes` specifies the number of modes not to be filtered out
-# `grid` specifies the number of grid points in the data
-function FourierLayer(in::Integer, out::Integer, grid::Integer, modes = 12,
-                        σ = identity; initf = cglorot_uniform, initl = Flux.glorot_uniform,
-                        bias_fourier=true, bias_linear=true)
-
-    # Initialize Fourier weight matrix (only with relevant modes)
-    Wf = initf(in, out, modes)
-    # Make sure filtering works
-    @assert modes <= floor(Int, grid/2 + 1) "Specified modes exceed allowed maximum. 
-    The number of modes to filter must be smaller than N/2 + 1"
-    # Pad the fourier weight matrix with additional zeros
-    Wf = pad_zeros(Wf, (0, floor(Int, grid/2 + 1) - modes), dims=3)
-
-    # Initialize Linear weight matrix
-    Wl = initl(out, in)
-
-    # Pass the bias bools
-    bf = bias_fourier
-    bl = bias_linear
-
-    # Pass the modes for output
-    λ = modes
-
-    return FourierLayer(Wf, Wl, grid, σ, λ, bf, bl)
-end
-
-# Only train the weight array with non-zero modes
-Flux.@functor FourierLayer 
-Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.λ], a.Wl, 
-                                typeof(a.bf) != Flux.Zeros ? a.bf[:,:,1:a.λ] : nothing,
-                                typeof(a.bl) != Flux.Zeros ? a.bl : nothing)
-
-# The actual layer that does stuff
-function (a::FourierLayer)(x::AbstractArray)
-    # Assign the parameters
-    Wf, Wl, bf, bl, σ, = a.Wf, a.Wl, a.bf, a.bl, NNlib.fast_act(a.σ, x)
-
-    # Do a permutation: DataLoader requires batch to be the last dim
-    # for the rest, it's more convenient to have it in the first one
-    xp = permutedims(x, [3,1,2])
-
-    # The linear path
-    # x -> Wl
-    @ein linear[batch, out, grid] := Wl[out, in] * xp[batch, in, grid]
-    linear .+ bl
-
-    # The convolution path
-    # x -> 𝔉 -> Wf -> i𝔉
-    # Do the Fourier transform (FFT) along the grid dimension of the input and
-    # Multiply the weight matrix with the input using batched multiplication
-    # We need to permute the input to (channel,batch,grid), otherwise batching won't work
-    @ein 𝔉[batch, out, grid] := Wf[in, out, grid] * rfft(xp, 3)[batch, in, grid]
-    𝔉 .+ bf
-
-    # Do the inverse transform
-    # We need to permute back to match the shape of the linear path
-    i𝔉 = irfft(𝔉, size(xp,3),3)
-
-    # Return the activated sum
-    return permutedims(σ.(linear + i𝔉), [2,3,1])
-end
-
-# Print nicely
-function Base.show(io::IO, l::FourierLayer)
-    print(io, "FourierLayer with\nConvolution path: (", size(l.Wf, 2), ", ",
-            size(l.Wf, 1), ", ", size(l.Wf, 3))
-    print(io, ")\n")
-    print(io, "Linear path: (", size(l.Wl, 2), ", ", size(l.Wl, 1))
-    print(io, ")\n")
-    print(io, "Fourier modes: ", l.λ)
-    print(io, "\n")
-    l.σ == identity || print(io, "Activation: ", l.σ)
-end
\ No newline at end of file
+"""
+`FourierLayer(in, out, grid, modes, σ=identity, init=glorot_uniform)`
+`FourierLayer(Wf::AbstractArray, Wl::AbstractArray, [bias_f, bias_l, σ])`
+
+Create a Layer of the Fourier Neural Operator as proposed by Li et al.
+arXiv: 2010.08895
+
+The layer does a fourier transform on the grid dimension of the input array,
+filters higher modes out by the weight matrix and transforms it to the
+specified output dimension such that In x M x N -> Out x M x N.
+The output though only contains the relevant Fourier modes with the rest padded to zero
+in the last axis as a result of the filtering.
+
+The input `x` should be a rank 3 tensor of shape
+(num parameters (`in`) x num grid points (`grid`) x batch size (`batch`))
+The output `y` will be a rank 3 tensor of shape
+(`out` x num grid points (`grid`) x batch size (`batch`))
+
+You can specify biases for the paths as you like, though the convolutional path is
+originally not intended to perform an affine transformation.
+
+# Examples
+Say you're considering a 1D diffusion problem on a 64 point grid. The input is comprised
+of the grid points as well as the IC at this point.
+The data consists of 200 instances of the solution.
+Beforehand we convert the two input channels into a higher-dimensional latent space with 128 nodes by using a regular `Dense` layer.
+So the input takes the dimension `128 x 64 x 200`.
+The output would be the diffused variable at a later time, which initially makes the output of the form `128 x 64 x 200` as well. Finally, we have to squeeze this high-dimensional ouptut into the one quantity of interest again by using a `Dense` layer.
+
+We wish to only keep the first 16 modes of the input and work with the classic sigmoid function as activation.
+
+So we would have:
+
+```julia
+model = FourierLayer(128, 128, 100, 16, σ)
+```
+"""
+struct FourierLayer{F, Tc <: Complex{<:AbstractFloat}, Tr <: AbstractFloat, Bf, Bl}
+    # F: Activation, Tc/Tr: Complex/Real eltype
+    Wf::AbstractArray{Tc, 3}
+    Wl::AbstractMatrix{Tr}
+    grid::Int
+    σ::F
+    λ::Int
+    bf::Bf
+    bl::Bl
+    # Constructor for the entire fourier layer
+    function FourierLayer(
+            Wf::AbstractArray{Tc, 3}, Wl::AbstractMatrix{Tr},
+            grid::Int, σ::F = identity,
+            λ::Int = 12, bf = true,
+            bl = true) where
+            {F, Tc <: Complex{<:AbstractFloat}, Tr <: AbstractFloat}
+
+        # create the biases with one singleton dimension
+        bf = Flux.create_bias(Wf, bf, 1, size(Wf, 2), size(Wf, 3))
+        bl = Flux.create_bias(Wl, bl, 1, size(Wl, 1), grid)
+        new{F, Tc, Tr, typeof(bf), typeof(bl)}(Wf, Wl, grid, σ, λ, bf, bl)
+    end
+end
+
+# Declare the function that assigns Weights and biases to the layer
+# `in` and `out` refer to the dimensionality of the number of parameters
+# `modes` specifies the number of modes not to be filtered out
+# `grid` specifies the number of grid points in the data
+function FourierLayer(in::Integer, out::Integer, grid::Integer, modes = 12,
+        σ = identity; initf = cglorot_uniform, initl = Flux.glorot_uniform,
+        bias_fourier = true, bias_linear = true)
+
+    # Initialize Fourier weight matrix (only with relevant modes)
+    Wf = initf(in, out, modes)
+    # Make sure filtering works
+    @assert modes <= floor(Int, grid/2 + 1) "Specified modes exceed allowed maximum. 
+    The number of modes to filter must be smaller than N/2 + 1"
+    # Pad the fourier weight matrix with additional zeros
+    Wf = pad_zeros(Wf, (0, floor(Int, grid/2 + 1) - modes), dims = 3)
+
+    # Initialize Linear weight matrix
+    Wl = initl(out, in)
+
+    # Pass the bias bools
+    bf = bias_fourier
+    bl = bias_linear
+
+    # Pass the modes for output
+    λ = modes
+
+    return FourierLayer(Wf, Wl, grid, σ, λ, bf, bl)
+end
+
+# Only train the weight array with non-zero modes
+Flux.@functor FourierLayer
+function Flux.trainable(a::FourierLayer)
+    (a.Wf[:, :, 1:a.λ], a.Wl,
+        typeof(a.bf) != Flux.Zeros ? a.bf[:, :, 1:a.λ] : nothing,
+        typeof(a.bl) != Flux.Zeros ? a.bl : nothing)
+end
+
+# The actual layer that does stuff
+function (a::FourierLayer)(x::AbstractArray)
+    # Assign the parameters
+    Wf, Wl, bf, bl, σ, = a.Wf, a.Wl, a.bf, a.bl, NNlib.fast_act(a.σ, x)
+
+    # Do a permutation: DataLoader requires batch to be the last dim
+    # for the rest, it's more convenient to have it in the first one
+    xp = permutedims(x, [3, 1, 2])
+
+    # The linear path
+    # x -> Wl
+    @ein linear[batch, out, grid] := Wl[out, in] * xp[batch, in, grid]
+    linear .+ bl
+
+    # The convolution path
+    # x -> 𝔉 -> Wf -> i𝔉
+    # Do the Fourier transform (FFT) along the grid dimension of the input and
+    # Multiply the weight matrix with the input using batched multiplication
+    # We need to permute the input to (channel,batch,grid), otherwise batching won't work
+    @ein 𝔉[batch, out, grid] := Wf[in, out, grid] * rfft(xp, 3)[batch, in, grid]
+    𝔉 .+ bf
+
+    # Do the inverse transform
+    # We need to permute back to match the shape of the linear path
+    i𝔉 = irfft(𝔉, size(xp, 3), 3)
+
+    # Return the activated sum
+    return permutedims(σ.(linear + i𝔉), [2, 3, 1])
+end
+
+# Print nicely
+function Base.show(io::IO, l::FourierLayer)
+    print(io, "FourierLayer with\nConvolution path: (", size(l.Wf, 2), ", ",
+        size(l.Wf, 1), ", ", size(l.Wf, 3))
+    print(io, ")\n")
+    print(io, "Linear path: (", size(l.Wl, 2), ", ", size(l.Wl, 1))
+    print(io, ")\n")
+    print(io, "Fourier modes: ", l.λ)
+    print(io, "\n")
+    l.σ == identity || print(io, "Activation: ", l.σ)
+end
diff --git a/src/OperatorLearning.jl b/src/OperatorLearning.jl
index 5e52209..866a397 100644
--- a/src/OperatorLearning.jl
+++ b/src/OperatorLearning.jl
@@ -1,21 +1,21 @@
-module OperatorLearning
-
-using Base: Integer, ident_cmp, Float32
-using CUDA
-using Flux
-using FFTW
-using FFTW: assert_applicable, unsafe_execute!, FORWARD, BACKWARD, rFFTWPlan
-using Random
-using Random: AbstractRNG
-using Flux: nfan, glorot_uniform, batch
-using OMEinsum
-
-export FourierLayer, DeepONet
-
-include("FourierLayer.jl")
-include("DeepONet.jl")
-include("ComplexWeights.jl")
-include("batched.jl")
-include("subnets.jl")
-
-end # module
+module OperatorLearning
+
+using Base: Integer, ident_cmp, Float32
+using CUDA
+using Flux
+using FFTW
+using FFTW: assert_applicable, unsafe_execute!, FORWARD, BACKWARD, rFFTWPlan
+using Random
+using Random: AbstractRNG
+using Flux: nfan, glorot_uniform, batch
+using OMEinsum
+
+export FourierLayer, DeepONet
+
+include("FourierLayer.jl")
+include("DeepONet.jl")
+include("ComplexWeights.jl")
+include("batched.jl")
+include("subnets.jl")
+
+end # module
diff --git a/src/batched.jl b/src/batched.jl
index 1aff86d..2696e4e 100644
--- a/src/batched.jl
+++ b/src/batched.jl
@@ -8,20 +8,20 @@ For `mul!`, `FFTW.jl` already provides an implementation. However, we need to lo
 """
 
 # Extension for CPU Arrays
-for (Tr,Tc) in ((:Float32,:(Complex{Float32})),(:Float64,:(Complex{Float64})))
+for (Tr, Tc) in ((:Float32, :(Complex{Float32})), (:Float64, :(Complex{Float64})))
     # Note: use $FORWARD and $BACKWARD below because of issue #9775
     @eval begin
-        function NNlib.batched_mul!(y::StridedArray{$Tc}, p::rFFTWPlan{$Tr,$FORWARD}, x::StridedArray{$Tr})
-            assert_applicable(p, x[:,:,1], y[:,:,1]) # no need to check every batch dim
-            @inbounds for k ∈ 1:size(y,3)
-                @views unsafe_execute!(p, x[:,:,k], y[:,:,k])
+        function NNlib.batched_mul!(y::StridedArray{$Tc}, p::rFFTWPlan{$Tr, $FORWARD}, x::StridedArray{$Tr})
+            assert_applicable(p, x[:, :, 1], y[:, :, 1]) # no need to check every batch dim
+            @inbounds for k in 1:size(y, 3)
+                @views unsafe_execute!(p, x[:, :, k], y[:, :, k])
             end
             return y
         end
-        function NNlib.batched_mul!(y::StridedArray{$Tr}, p::rFFTWPlan{$Tc,$BACKWARD}, x::StridedArray{$Tc})
-            assert_applicable(p, x[:,:,1], y[:,:,1]) # no need to check every batch dim
-            @inbounds for k ∈ 1:size(y,3)
-                @views unsafe_execute!(p, x[:,:,k], y[:,:,k])
+        function NNlib.batched_mul!(y::StridedArray{$Tr}, p::rFFTWPlan{$Tc, $BACKWARD}, x::StridedArray{$Tc})
+            assert_applicable(p, x[:, :, 1], y[:, :, 1]) # no need to check every batch dim
+            @inbounds for k in 1:size(y, 3)
+                @views unsafe_execute!(p, x[:, :, k], y[:, :, k])
             end
             return y
         end
@@ -29,11 +29,11 @@ for (Tr,Tc) in ((:Float32,:(Complex{Float32})),(:Float64,:(Complex{Float64})))
 end
 
 # Methods for GPU Arrays, borrowed from CUDA.jl -> fft.jl #490
-function NNlib.batched_mul!(y::DenseCuArray{Ty}, p::CuFFTPlan{T,K,false}, x::DenseCuArray{T}
-                           ) where {Ty,T,K}
-    CUFFT.assert_applicable(p, x[:,:,1], y[:,:,1])
-    @inbounds for k ∈ 1:size(y,3)
-        @views CUFFT.unsafe_execute!(p, x[:,:,k] ,y[:,:,k])
+function NNlib.batched_mul!(y::DenseCuArray{Ty}, p::CuFFTPlan{T, K, false}, x::DenseCuArray{T}
+) where {Ty, T, K}
+    CUFFT.assert_applicable(p, x[:, :, 1], y[:, :, 1])
+    @inbounds for k in 1:size(y, 3)
+        @views CUFFT.unsafe_execute!(p, x[:, :, k], y[:, :, k])
     end
     return y
 end
diff --git a/src/subnets.jl b/src/subnets.jl
index 82af33b..93d271c 100644
--- a/src/subnets.jl
+++ b/src/subnets.jl
@@ -24,16 +24,16 @@ julia> model([2,1])
 ```
 """
 function construct_subnet(architecture::Tuple, σ = identity;
-                          init=Flux.glorot_uniform, bias=true)
+        init = Flux.glorot_uniform, bias = true)
     # First, create an array that contains all Dense layers independently
     # Given n-element architecture constructs n-1 layers
     layers = Array{Flux.Dense}(undef, length(architecture)-1)
-    @inbounds for i ∈ 2:length(architecture)
-      layers[i-1] = Flux.Dense(architecture[i-1], architecture[i], σ;
-                                init=init, bias=bias)
+    @inbounds for i in 2:length(architecture)
+        layers[i - 1] = Flux.Dense(architecture[i - 1], architecture[i], σ;
+            init = init, bias = bias)
     end
 
     # Concatenate the layers to a string, chain them and parse them into
     # the Flux Chain constructor syntax
-    return Meta.parse("Chain("*join(layers,",")*")") |> eval
-end
\ No newline at end of file
+    return Meta.parse("Chain("*join(layers, ",")*")") |> eval
+end
diff --git a/test/complexweights.jl b/test/complexweights.jl
index e274449..bfc3988 100644
--- a/test/complexweights.jl
+++ b/test/complexweights.jl
@@ -1,13 +1,13 @@
-using Test, Random, Flux
-
-@testset "Weights" begin
-    @testset "uniform" begin
-        @test size(OperatorLearning.cglorot_uniform(128,64,10)) == (128, 64, 10)
-        @test eltype(OperatorLearning.cglorot_uniform(128,64,10)) == ComplexF32
-    end
-
-    @testset "normal" begin
-        @test size(OperatorLearning.cglorot_normal(128,64,10)) == (128, 64, 10)
-        @test eltype(OperatorLearning.cglorot_normal(128,64,10)) == ComplexF32
-    end
-end
\ No newline at end of file
+using Test, Random, Flux
+
+@testset "Weights" begin
+    @testset "uniform" begin
+        @test size(OperatorLearning.cglorot_uniform(128, 64, 10)) == (128, 64, 10)
+        @test eltype(OperatorLearning.cglorot_uniform(128, 64, 10)) == ComplexF32
+    end
+
+    @testset "normal" begin
+        @test size(OperatorLearning.cglorot_normal(128, 64, 10)) == (128, 64, 10)
+        @test eltype(OperatorLearning.cglorot_normal(128, 64, 10)) == ComplexF32
+    end
+end
diff --git a/test/deeponet.jl b/test/deeponet.jl
index 0f3a482..22b9456 100644
--- a/test/deeponet.jl
+++ b/test/deeponet.jl
@@ -4,14 +4,18 @@ using Test, Random, Flux
     @testset "dimensions" begin
         # Test the proper construction
         # Branch net
-        @test size(DeepONet((32,64,72), (24,48,72), σ, tanh).branch_net.layers[end].weight) == (72,64)
-        @test size(DeepONet((32,64,72), (24,48,72), σ, tanh).branch_net.layers[end].bias) == (72,)
+        @test size(DeepONet((32, 64, 72), (24, 48, 72), σ, tanh).branch_net.layers[end].weight) ==
+              (72, 64)
+        @test size(DeepONet((32, 64, 72), (24, 48, 72), σ, tanh).branch_net.layers[end].bias) ==
+              (72,)
         # Trunk net
-        @test size(DeepONet((32,64,72), (24,48,72), σ, tanh).trunk_net.layers[end].weight) == (72,48)
-        @test size(DeepONet((32,64,72), (24,48,72), σ, tanh).trunk_net.layers[end].bias) == (72,)
+        @test size(DeepONet((32, 64, 72), (24, 48, 72), σ, tanh).trunk_net.layers[end].weight) ==
+              (72, 48)
+        @test size(DeepONet((32, 64, 72), (24, 48, 72), σ, tanh).trunk_net.layers[end].bias) ==
+              (72,)
     end
 
     # Accept only Int as architecture parameters
-    @test_throws MethodError DeepONet((32.5,64,72), (24,48,72), σ, tanh)
-    @test_throws MethodError DeepONet((32,64,72), (24.1,48,72))
-end
\ No newline at end of file
+    @test_throws MethodError DeepONet((32.5, 64, 72), (24, 48, 72), σ, tanh)
+    @test_throws MethodError DeepONet((32, 64, 72), (24.1, 48, 72))
+end
diff --git a/test/fourierlayer.jl b/test/fourierlayer.jl
index a26af98..98bd987 100644
--- a/test/fourierlayer.jl
+++ b/test/fourierlayer.jl
@@ -1,31 +1,31 @@
-using Test, Random, Flux
-
-@testset "FourierLayer" begin
-    @testset "dimensions" begin
-        # Test the proper construction
-        @test size(FourierLayer(128, 64, 100, 20).Wf) == (128, 64, 51)
-        @test size(FourierLayer(128, 64, 100, 20).Wl) == (64, 128)
-        @test size(FourierLayer(128, 64, 100, 20).bf) == (1, 64, 51)
-        @test size(FourierLayer(128, 64, 100, 20).bl) == (1, 64, 100)
-    end
-
-    # Test proper parameter assignment
-    # We only use a subset of the weight tensors for training
-    @testset "parameters" begin
-        # Wf
-        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[1]) == (128, 64, 20)
-        # Wl
-        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[2]) == (64, 128)
-        # bf
-        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[3]) == (1, 64, 20)
-        # bl
-        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[4]) == (1, 64, 100)
-    end
-
-    # Accept only Int as architecture parameters
-    @test_throws MethodError FourierLayer(128.5, 64, 100, 20)
-    @test_throws MethodError FourierLayer(128.5, 64, 100, 20, tanh)
-    # Test max amount of modes
-    @test_throws AssertionError FourierLayer(100, 100, 100, 60, σ)
-    @test_throws AssertionError FourierLayer(100, 100, 100, 60)
-end
+using Test, Random, Flux
+
+@testset "FourierLayer" begin
+    @testset "dimensions" begin
+        # Test the proper construction
+        @test size(FourierLayer(128, 64, 100, 20).Wf) == (128, 64, 51)
+        @test size(FourierLayer(128, 64, 100, 20).Wl) == (64, 128)
+        @test size(FourierLayer(128, 64, 100, 20).bf) == (1, 64, 51)
+        @test size(FourierLayer(128, 64, 100, 20).bl) == (1, 64, 100)
+    end
+
+    # Test proper parameter assignment
+    # We only use a subset of the weight tensors for training
+    @testset "parameters" begin
+        # Wf
+        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[1]) == (128, 64, 20)
+        # Wl
+        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[2]) == (64, 128)
+        # bf
+        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[3]) == (1, 64, 20)
+        # bl
+        @test size(Flux.params(FourierLayer(128, 64, 100, 20))[4]) == (1, 64, 100)
+    end
+
+    # Accept only Int as architecture parameters
+    @test_throws MethodError FourierLayer(128.5, 64, 100, 20)
+    @test_throws MethodError FourierLayer(128.5, 64, 100, 20, tanh)
+    # Test max amount of modes
+    @test_throws AssertionError FourierLayer(100, 100, 100, 60, σ)
+    @test_throws AssertionError FourierLayer(100, 100, 100, 60)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0294fd7..8eebe90 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,27 +1,27 @@
-using Pkg
-const LONGER_TESTS = false
-const GROUP = get(ENV, "GROUP", "All")
-
-using OperatorLearning
-using Test
-using Random
-
-Random.seed!(0)
-
-if GROUP == "All" || GROUP == "Core"
-@testset "FourierLayer" begin
-    include("fourierlayer.jl")
-end
-
-@testset "DeepONet" begin
-    include("deeponet.jl")
-end
-
-@testset "Weights" begin
-    include("complexweights.jl")
-end
-end
-
-if GROUP == "GPU"
-    # Add GPU Tests Here
-end
+using Pkg
+const LONGER_TESTS = false
+const GROUP = get(ENV, "GROUP", "All")
+
+using OperatorLearning
+using Test
+using Random
+
+Random.seed!(0)
+
+if GROUP == "All" || GROUP == "Core"
+    @testset "FourierLayer" begin
+        include("fourierlayer.jl")
+    end
+
+    @testset "DeepONet" begin
+        include("deeponet.jl")
+    end
+
+    @testset "Weights" begin
+        include("complexweights.jl")
+    end
+end
+
+if GROUP == "GPU"
+    # Add GPU Tests Here
+end