From cd72219d0b25ab87ce5be0805fcfed749b7f1edf Mon Sep 17 00:00:00 2001
From: jan <jan@jan.jan>
Date: Fri, 5 Aug 2016 18:40:27 -0700
Subject: [PATCH 01/37] Revert "lookup tables" (accidental commit)

This reverts commit 7bf7a3d80c7971482969984b1b0fe0016d212024.
---
 fdtd/boundary.py   | 41 ++++++++++++++++-------------------------
 fdtd/simulation.py |  4 ++--
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/fdtd/boundary.py b/fdtd/boundary.py
index 31dfac8..bb8dbdf 100644
--- a/fdtd/boundary.py
+++ b/fdtd/boundary.py
@@ -121,27 +121,15 @@ def cpml(direction: int,
     p0e, p1e = par(xe)
     p0h, p1h = par(xh)
 
-    def create_table(name, xs, type='float'):
-       s = type + ''' {name}(int x) {{
-    switch (x) {{
-'''.format(name=name)
-       for i, x in enumerate(xs):
-           s += '''        case {i}: return {x};
-'''.format(i=i, x=x)
-       s += '''
-    }
-}'''
-       return s
-
     vals = {'r': r,
             'u': uv[0],
             'v': uv[1],
             'np':   np,
             'th':   thickness,
-            #'p0e': ', '.join((str(x) for x in p0e)),
-            #'p1e': ', '.join((str(x) for x in p1e)),
-            #'p0h': ', '.join((str(x) for x in p0h)),
-            #'p1h': ', '.join((str(x) for x in p1h)),
+            'p0e': ', '.join((str(x) for x in p0e)),
+            'p1e': ', '.join((str(x) for x in p1e)),
+            'p0h': ', '.join((str(x) for x in p0h)),
+            'p1h': ', '.join((str(x) for x in p1h)),
             'se': '-+'[direction % 2],
             'sh': '+-'[direction % 2]}
 
@@ -161,16 +149,24 @@ if ( (s{r} - 1) > {r} && {r} > (s{r} - 1) - ({th} + 1) ) {{
         raise Exception('Bad polarity (=0)')
 
     code_e = """
-    Psi_{r}{np}_E{u}[ip] = p0e(ir) * Psi_{r}{np}_E{u}[ip] + p1e(ir) * (H{v}[i] - H{v}[i-di{r}]);
-    Psi_{r}{np}_E{v}[ip] = p0e(ir) * Psi_{r}{np}_E{v}[ip] + p1e(ir) * (H{u}[i] - H{u}[i-di{r}]);
+    // pml parameters:
+    const float p0[{th}] = {{ {p0e} }};
+    const float p1[{th}] = {{ {p1e} }};
+
+    Psi_{r}{np}_E{u}[ip] = p0[ir] * Psi_{r}{np}_E{u}[ip] + p1[ir] * (H{v}[i] - H{v}[i-di{r}]);
+    Psi_{r}{np}_E{v}[ip] = p0[ir] * Psi_{r}{np}_E{v}[ip] + p1[ir] * (H{u}[i] - H{u}[i-di{r}]);
 
     E{u}[i] {se}= dt / eps{u}[i] * Psi_{r}{np}_E{u}[ip];
     E{v}[i] {sh}= dt / eps{v}[i] * Psi_{r}{np}_E{v}[ip];
 }}
 """
     code_h = """
-    Psi_{r}{np}_H{u}[ip] = p0h(ir) * Psi_{r}{np}_H{u}[ip] + p1h(ir) * (E{v}[i+di{r}] - E{v}[i]);
-    Psi_{r}{np}_H{v}[ip] = p0h(ir) * Psi_{r}{np}_H{v}[ip] + p1h(ir) * (E{u}[i+di{r}] - E{u}[i]);
+    // pml parameters:
+    const float p0[{th}] = {{ {p0h} }};
+    const float p1[{th}] = {{ {p1h} }};
+
+    Psi_{r}{np}_H{u}[ip] = p0[ir] * Psi_{r}{np}_H{u}[ip] + p1[ir] * (E{v}[i+di{r}] - E{v}[i]);
+    Psi_{r}{np}_H{v}[ip] = p0[ir] * Psi_{r}{np}_H{v}[ip] + p1[ir] * (E{u}[i+di{r}] - E{u}[i]);
 
     H{u}[i] {sh}= dt * Psi_{r}{np}_H{u}[ip];
     H{v}[i] {se}= dt * Psi_{r}{np}_H{v}[ip];
@@ -184,11 +180,6 @@ if ( (s{r} - 1) > {r} && {r} > (s{r} - 1) - ({th} + 1) ) {{
                   'Psi_{r}{np}_E{v}'.format(**vals)],
         'psi_H': ['Psi_{r}{np}_H{u}'.format(**vals),
                   'Psi_{r}{np}_H{v}'.format(**vals)],
-        'tables': create_table('p0e', p0e) + \
-                  create_table('p1e', p1e) + \
-                  create_table('p0h', p0h) + \
-                  create_table('p1h', p1h)
-
     }
 
     return pml_data
diff --git a/fdtd/simulation.py b/fdtd/simulation.py
index d3a33df..227dcb0 100644
--- a/fdtd/simulation.py
+++ b/fdtd/simulation.py
@@ -174,8 +174,8 @@ class Simulation(object):
         pml_E_args = ', '.join(E_args + H_args + dt_arg + eps_args + arglist_E)
         pml_H_args = ', '.join(E_args + H_args + dt_arg + arglist_H)
 
-        pml_E = ElementwiseKernel(self.context, preamble=pml_data['tables'], arguments=pml_E_args, operation=pml_E_source)
-        pml_H = ElementwiseKernel(self.context, preamble=pml_data['tables'], arguments=pml_H_args, operation=pml_H_source)
+        pml_E = ElementwiseKernel(self.context, arguments=pml_E_args, operation=pml_E_source)
+        pml_H = ElementwiseKernel(self.context, arguments=pml_H_args, operation=pml_H_source)
 
         self.cpml_E = lambda e: pml_E(*self.E, *self.H, self.dt, *self.eps, *psi_E, wait_for=e)
         self.cpml_H = lambda e: pml_H(*self.E, *self.H, self.dt, *psi_H, wait_for=e)

From a6e601b6484406983b4b6b3e36250b9eb3fbe334 Mon Sep 17 00:00:00 2001
From: jan <janp@stanford.edu>
Date: Sat, 16 Jul 2016 17:44:51 -0700
Subject: [PATCH 02/37] Clean up comments

---
 fdtd.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fdtd.py b/fdtd.py
index 0ce560e..e586ee3 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -96,7 +96,8 @@ def main():
     z_max = 1.6 * a
     xyz_max = numpy.hstack((xy_max, z_max)) + pml_thickness * dx
 
-    # Coordinates of the edges of the cells. The fdtd package can only do square grids at the moment.
+    # Coordinates of the edges of the cells.
+    #  The fdtd package can only do square grids at the moment.
     half_edge_coords = [numpy.arange(dx/2, m + dx, step=dx) for m in xyz_max]
     edge_coords = [numpy.hstack((-h[::-1], h)) for h in half_edge_coords]
 
@@ -139,8 +140,8 @@ def main():
         return numpy.sin(w * t0) * numpy.exp(-alpha * t0**2)
 
     # #### Run a bunch of iterations ####
-    # event = sim.whatever([prev_event]) indicates that sim.whatever should be queued immediately and run
-    #  once prev_event is finished.
+    # event = sim.whatever([prev_event]) indicates that sim.whatever should be queued
+    #  immediately and run once prev_event is finished.
     output_file = h5py.File('simulation_output.h5', 'w')
     start = time.perf_counter()
     for t in range(max_t):

From 6c3313d7c985d87ad45f9eed145af4f91646ed19 Mon Sep 17 00:00:00 2001
From: jan <janp@stanford.edu>
Date: Sat, 16 Jul 2016 18:07:54 -0700
Subject: [PATCH 03/37] call overhead still way too big

---
 fdtd.py      | 9 ++++++---
 fdtd/base.py | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/fdtd.py b/fdtd.py
index e586ee3..8e30fd0 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -75,7 +75,7 @@ def perturbed_l3(a: float, radius: float, **kwargs) -> Pattern:
 def main():
     max_t = 8000            # number of timesteps
 
-    dx = 40                 # discretization (nm/cell)
+    dx = 15                 # discretization (nm/cell)
     pml_thickness = 8       # (number of cells)
 
     wl = 1550               # Excitation wavelength and fwhm
@@ -109,13 +109,16 @@ def main():
                    eps=n_slab**2)
     mask = perturbed_l3(a, r)
 
-    grid.draw_polygons(surface_normal=gridlock.Direction.z,
+    for i in range(5000):
+        print(i)
+        grid.draw_polygons(surface_normal=gridlock.Direction.z,
                        center=[0, 0, 0],
                        thickness=2 * th,
                        eps=n_air**2,
                        polygons=mask.as_polygons())
+    return
 
-    print(grid.shape)
+    print(grid.shape, numpy.prod(grid.shape))
     # #### Create the simulation grid ####
     sim = Simulation(grid.grids)
 
diff --git a/fdtd/base.py b/fdtd/base.py
index 6285fb9..3717a65 100644
--- a/fdtd/base.py
+++ b/fdtd/base.py
@@ -30,7 +30,8 @@ const int diy = sz;
 const int diz = 1;
 """
 
-# Given a linear index i and shape sx, sy, sz, defines x, y, and z as the 3D indices of the current element (i).
+# Given a linear index i and shape sx, sy, sz, defines x, y, and z
+#  as the 3D indices of the current element (i).
 xyz_source = """
 // Convert linear index to field index (xyz)
 const int x = i / (sz * sy);

From d34c478f1d7122028f9540737de47cbc08b2d3cc Mon Sep 17 00:00:00 2001
From: jan <jan@orange.mpxd.net>
Date: Thu, 1 Sep 2016 14:39:44 -0700
Subject: [PATCH 04/37] Rewrite, with the following features: - Move to jinja2
 templates for the opencl code - Combine PML code into the E, H updates for
 speed - Add Poynting vector calculation code, including precalculation during
 H update - Use arrays for PML parameters (p0, p1) - Switch to linearized,
 C-ordered fields (~50% performance boost??)

- Added jinja2 and fdfd_tools dependencies
---
 fdtd.py                  |  68 ++++++-----
 fdtd/base.py             |  72 -----------
 fdtd/boundary.py         | 185 ----------------------------
 fdtd/kernels/common.cl   |  84 +++++++++++++
 fdtd/kernels/update_e.cl |  78 ++++++++++++
 fdtd/kernels/update_h.cl | 125 +++++++++++++++++++
 fdtd/kernels/update_s.cl |  36 ++++++
 fdtd/simulation.py       | 258 ++++++++++++++++++++++-----------------
 requirements.txt         |   2 +
 9 files changed, 506 insertions(+), 402 deletions(-)
 delete mode 100644 fdtd/base.py
 delete mode 100644 fdtd/boundary.py
 create mode 100644 fdtd/kernels/common.cl
 create mode 100644 fdtd/kernels/update_e.cl
 create mode 100644 fdtd/kernels/update_h.cl
 create mode 100644 fdtd/kernels/update_s.cl

diff --git a/fdtd.py b/fdtd.py
index 0ce560e..ecd16b0 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -8,12 +8,17 @@ import sys
 import time
 
 import numpy
-import h5py
+import lzma 
+import dill
 
 from fdtd.simulation import Simulation
 from masque import Pattern, shapes
 import gridlock
 import pcgen
+import fdfd_tools
+
+
+__author__ = 'Jan Petykiewicz'
 
 
 def perturbed_l3(a: float, radius: float, **kwargs) -> Pattern:
@@ -75,7 +80,7 @@ def perturbed_l3(a: float, radius: float, **kwargs) -> Pattern:
 def main():
     max_t = 8000            # number of timesteps
 
-    dx = 40                 # discretization (nm/cell)
+    dx = 25                 # discretization (nm/cell)
     pml_thickness = 8       # (number of cells)
 
     wl = 1550               # Excitation wavelength and fwhm
@@ -114,19 +119,9 @@ def main():
                        eps=n_air**2,
                        polygons=mask.as_polygons())
 
-    print(grid.shape)
+    print('grid shape: {}'.format(grid.shape))
     # #### Create the simulation grid ####
-    sim = Simulation(grid.grids)
-
-    # Conducting boundaries and pmls in every direction
-    c_args = []
-    pml_args = []
-    for d in (0, 1, 2):
-        for p in (-1, 1):
-            c_args += [{'direction': d, 'polarity': p}]
-            pml_args += [{'direction': d, 'polarity': p, 'epsilon_eff': n_slab**2}]
-    sim.init_conductors(c_args)
-    sim.init_cpml(pml_args)
+    sim = Simulation(grid.grids, do_poynting=True, pml_thickness=8)
 
     # Source parameters and function
     w = 2 * numpy.pi * dx / wl
@@ -137,31 +132,44 @@ def main():
     def field_source(i):
         t0 = i * sim.dt - delay
         return numpy.sin(w * t0) * numpy.exp(-alpha * t0**2)
+   
+    with open('sources.c', 'w') as f:
+        f.write(sim.sources['E'])
+        f.write('\n==========================================\n')
+        f.write(sim.sources['H'])
+        if sim.update_S:
+            f.write('\n==========================================\n')
+            f.write(sim.sources['S'])
 
     # #### Run a bunch of iterations ####
     # event = sim.whatever([prev_event]) indicates that sim.whatever should be queued immediately and run
     #  once prev_event is finished.
-    output_file = h5py.File('simulation_output.h5', 'w')
     start = time.perf_counter()
     for t in range(max_t):
-        event = sim.cpml_E([])
-        sim.update_E([event]).wait()
+        sim.update_E([]).wait()
 
-        sim.E[1][tuple(grid.shape//2)] += field_source(t)
-        event = sim.conductor_E([])
-        event = sim.cpml_H([event])
-        event = sim.update_H([event])
-        sim.conductor_H([event]).wait()
+        ind = numpy.ravel_multi_index(tuple(grid.shape//2), dims=grid.shape, order='C') + numpy.prod(grid.shape)
+        sim.E[ind] += field_source(t)
+        e = sim.update_H([])
+        if sim.update_S:
+            e = sim.update_S([e])
+        e.wait()
 
-        print('iteration {}: average {} iterations per sec'.format(t, (t+1)/(time.perf_counter()-start)))
-        sys.stdout.flush()
+        if t % 100 == 0:
+            print('iteration {}: average {} iterations per sec'.format(t, (t+1)/(time.perf_counter()-start)))
+            sys.stdout.flush()
 
-        # Save field slices
-        if (t % 20 == 0 and (max_t - t < 1000 or t < 1000)) or t == max_t-1:
-            print('saving E-field')
-            for j, f in enumerate(sim.E):
-                a = f.get()
-                output_file['/E{}_t{}'.format('xyz'[j], t)] = a[:, :, round(a.shape[2]/2)]
+    with lzma.open('saved_simulation', 'wb') as f:
+        def unvec(f):
+            return fdfd_tools.unvec(f, grid.shape)
+        d = {
+            'grid': grid,
+            'E': unvec(sim.E.get()),
+            'H': unvec(sim.H.get()),
+            }
+        if sim.S is not None:
+            d['S'] = unvec(sim.S.get())
+        dill.dump(d, f)
 
 if __name__ == '__main__':
     main()
diff --git a/fdtd/base.py b/fdtd/base.py
deleted file mode 100644
index 6285fb9..0000000
--- a/fdtd/base.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""
-Basic code snippets for opencl FDTD
-"""
-
-from typing import List
-import numpy
-
-
-def shape_source(shape: List[int] or numpy.ndarray) -> str:
-    """
-    Defines sx, sy, sz C constants specifying the shape of the grid in each of the 3 dimensions.
-
-    :param shape: [sx, sy, sz] values.
-    :return: String containing C source.
-    """
-    sxyz = """
-// Field sizes
-const int sx = {shape[0]};
-const int sy = {shape[1]};
-const int sz = {shape[2]};
-""".format(shape=shape)
-    return sxyz
-
-# Defines dix, diy, diz constants used for stepping in the x, y, z directions in a linear array
-#  (ie, given Ex[i] referring to position (x, y, z), Ex[i+diy] will refer to position (x, y+1, z))
-dixyz_source = """
-// Convert offset in field xyz to linear index offset
-const int dix = sz * sy;
-const int diy = sz;
-const int diz = 1;
-"""
-
-# Given a linear index i and shape sx, sy, sz, defines x, y, and z as the 3D indices of the current element (i).
-xyz_source = """
-// Convert linear index to field index (xyz)
-const int x = i / (sz * sy);
-const int y = (i - x * sz * sy) / sz;
-const int z = (i - y * sz - x * sz * sy);
-"""
-
-# Source code for updating the E field; maxes use of dixyz_source.
-maxwell_E_source = """
-// E update equations
-Ex[i] += dt / epsx[i] * ((Hz[i] - Hz[i-diy]) - (Hy[i] - Hy[i-diz]));
-Ey[i] += dt / epsy[i] * ((Hx[i] - Hx[i-diz]) - (Hz[i] - Hz[i-dix]));
-Ez[i] += dt / epsz[i] * ((Hy[i] - Hy[i-dix]) - (Hx[i] - Hx[i-diy]));
-"""
-
-# Source code for updating the H field; maxes use of dixyz_source and assumes mu=0
-maxwell_H_source = """
-// H update equations
-Hx[i] -= dt * ((Ez[i+diy] - Ez[i]) - (Ey[i+diz] - Ey[i]));
-Hy[i] -= dt * ((Ex[i+diz] - Ex[i]) - (Ez[i+dix] - Ez[i]));
-Hz[i] -= dt * ((Ey[i+dix] - Ey[i]) - (Ex[i+diy] - Ex[i]));
-"""
-
-
-def type_to_C(float_type: numpy.float32 or numpy.float64) -> str:
-    """
-    Returns a string corresponding to the C equivalent of a numpy type.
-    Only works for float32 and float64.
-
-    :param float_type: numpy.float32 or numpy.float64
-    :return: string containing the corresponding C type (eg. 'double')
-    """
-    if float_type == numpy.float32:
-        arg_type = 'float'
-    elif float_type == numpy.float64:
-        arg_type = 'double'
-    else:
-        raise Exception('Unsupported type')
-    return arg_type
diff --git a/fdtd/boundary.py b/fdtd/boundary.py
deleted file mode 100644
index bb8dbdf..0000000
--- a/fdtd/boundary.py
+++ /dev/null
@@ -1,185 +0,0 @@
-from typing import List, Dict
-import numpy
-
-
-def conductor(direction: int,
-              polarity: int,
-              ) -> List[str]:
-    """
-    Create source code for conducting boundary conditions.
-
-    :param direction: integer in range(3), corresponding to x,y,z.
-    :param polarity: -1 or 1, specifying eg. a -x or +x boundary.
-    :return: [E_source, H_source] source code for E and H boundary update steps.
-    """
-    if direction not in range(3):
-        raise Exception('Invalid direction: {}'.format(direction))
-
-    if polarity not in (-1, 1):
-        raise Exception('Invalid polarity: {}'.format(polarity))
-
-    r = 'xyz'[direction]
-    uv = 'xyz'.replace(r, '')
-
-    if polarity < 0:
-        bc_e = """
-if ({r} == 0) {{
-    E{r}[i] = 0;
-    E{u}[i] = E{u}[i+di{r}];
-    E{v}[i] = E{v}[i+di{r}];
-}}
-"""
-        bc_h = """
-if ({r} == 0) {{
-    H{r}[i] = H{r}[i+di{r}];
-    H{u}[i] = 0;
-    H{v}[i] = 0;
-}}
-"""
-
-    elif polarity > 0:
-        bc_e = """
-if ({r} == s{r} - 1) {{
-    E{r}[i] = -E{r}[i-2*di{r}];
-    E{u}[i] = +E{u}[i-di{r}];
-    E{v}[i] = +E{v}[i-di{r}];
-}} else if ({r} == s{r} - 2) {{
-    E{r}[i] = 0;
-}}
-"""
-        bc_h = """
-if ({r} == s{r} - 1) {{
-    H{r}[i] = +H{r}[i-di{r}];
-    H{u}[i] = -H{u}[i-2*di{r}];
-    H{v}[i] = -H{v}[i-2*di{r}];
-}} else if ({r} == s{r} - 2) {{
-    H{u}[i] = 0;
-    H{v}[i] = 0;
-}}
-"""
-    else:
-        raise Exception()
-
-    replacements = {'r': r, 'u': uv[0], 'v': uv[1]}
-    return [s.format(**replacements) for s in (bc_e, bc_h)]
-
-
-def cpml(direction: int,
-         polarity: int,
-         dt: float,
-         thickness: int=8,
-         epsilon_eff: float=1,
-         ) -> Dict:
-    """
-    Generate source code for complex phase matched layer (cpml) absorbing boundaries.
-    These are not full boundary conditions and require a conducting boundary to be added
-     in the same direction as the pml.
-
-    :param direction: integer in range(3), corresponding to x, y, z directions.
-    :param polarity: -1 or 1, corresponding to eg. -x or +x direction.
-    :param dt: timestep used by the simulation
-    :param thickness: Number of cells used by the pml (the grid is NOT expanded to add these cells). Default 8.
-    :param epsilon_eff: Effective epsilon_r of the pml layer. Default 1.
-    :return: Dict with entries 'E', 'H' (update equations for E and H) and 'psi_E', 'psi_H' (lists of str,
-            specifying the field names of the cpml fields used in the E and H update steps. Eg.,
-            Psi_xn_Ex for the complex Ex component for the x- pml.)
-    """
-    if direction not in range(3):
-        raise Exception('Invalid direction: {}'.format(direction))
-
-    if polarity not in (-1, 1):
-        raise Exception('Invalid polarity: {}'.format(polarity))
-
-    if thickness <= 2:
-        raise Exception('It would be wise to have a pml with 4+ cells of thickness')
-
-    if epsilon_eff <= 0:
-        raise Exception('epsilon_eff must be positive')
-
-    m = (3.5, 1)
-    sigma_max = 0.8 * (m[0] + 1) / numpy.sqrt(epsilon_eff)
-    alpha_max = 0  # TODO: Decide what to do about non-zero alpha
-    transverse = numpy.delete(range(3), direction)
-
-    r = 'xyz'[direction]
-    np = 'nVp'[numpy.sign(polarity)+1]
-    uv = ['xyz'[i] for i in transverse]
-
-    xe = numpy.arange(1, thickness+1, dtype=float)[::-1]
-    xh = numpy.arange(1, thickness+1, dtype=float)[::-1]
-    if polarity > 0:
-        xe -= 0.5
-    elif polarity < 0:
-        xh -= 0.5
-
-    def par(x):
-        sigma = ((x / thickness) ** m[0]) * sigma_max
-        alpha = ((1 - x / thickness) ** m[1]) * alpha_max
-        p0 = numpy.exp(-(sigma + alpha) * dt)
-        p1 = sigma / (sigma + alpha) * (p0 - 1)
-        return p0, p1
-    p0e, p1e = par(xe)
-    p0h, p1h = par(xh)
-
-    vals = {'r': r,
-            'u': uv[0],
-            'v': uv[1],
-            'np':   np,
-            'th':   thickness,
-            'p0e': ', '.join((str(x) for x in p0e)),
-            'p1e': ', '.join((str(x) for x in p1e)),
-            'p0h': ', '.join((str(x) for x in p0h)),
-            'p1h': ', '.join((str(x) for x in p1h)),
-            'se': '-+'[direction % 2],
-            'sh': '+-'[direction % 2]}
-
-    if polarity < 0:
-        bounds_if = """
-if ( 0 < {r} && {r} < {th} + 1 ) {{
-    const int ir = {r} - 1;                             // index into pml parameters
-    const int ip = {v} + {u} * s{v} + ir * s{v} * s{u};   // linear index into Psi
-"""
-    elif polarity > 0:
-        bounds_if = """
-if ( (s{r} - 1) > {r} && {r} > (s{r} - 1) - ({th} + 1) ) {{
-    const int ir = (s{r} - 1) - ({r} + 1);              // index into pml parameters
-    const int ip = {v} + {u} * s{v} + ir * s{v} * s{u};   // linear index into Psi
-"""
-    else:
-        raise Exception('Bad polarity (=0)')
-
-    code_e = """
-    // pml parameters:
-    const float p0[{th}] = {{ {p0e} }};
-    const float p1[{th}] = {{ {p1e} }};
-
-    Psi_{r}{np}_E{u}[ip] = p0[ir] * Psi_{r}{np}_E{u}[ip] + p1[ir] * (H{v}[i] - H{v}[i-di{r}]);
-    Psi_{r}{np}_E{v}[ip] = p0[ir] * Psi_{r}{np}_E{v}[ip] + p1[ir] * (H{u}[i] - H{u}[i-di{r}]);
-
-    E{u}[i] {se}= dt / eps{u}[i] * Psi_{r}{np}_E{u}[ip];
-    E{v}[i] {sh}= dt / eps{v}[i] * Psi_{r}{np}_E{v}[ip];
-}}
-"""
-    code_h = """
-    // pml parameters:
-    const float p0[{th}] = {{ {p0h} }};
-    const float p1[{th}] = {{ {p1h} }};
-
-    Psi_{r}{np}_H{u}[ip] = p0[ir] * Psi_{r}{np}_H{u}[ip] + p1[ir] * (E{v}[i+di{r}] - E{v}[i]);
-    Psi_{r}{np}_H{v}[ip] = p0[ir] * Psi_{r}{np}_H{v}[ip] + p1[ir] * (E{u}[i+di{r}] - E{u}[i]);
-
-    H{u}[i] {sh}= dt * Psi_{r}{np}_H{u}[ip];
-    H{v}[i] {se}= dt * Psi_{r}{np}_H{v}[ip];
-}}
-"""
-
-    pml_data = {
-        'E': (bounds_if + code_e).format(**vals),
-        'H': (bounds_if + code_h).format(**vals),
-        'psi_E': ['Psi_{r}{np}_E{u}'.format(**vals),
-                  'Psi_{r}{np}_E{v}'.format(**vals)],
-        'psi_H': ['Psi_{r}{np}_H{u}'.format(**vals),
-                  'Psi_{r}{np}_H{v}'.format(**vals)],
-    }
-
-    return pml_data
diff --git a/fdtd/kernels/common.cl b/fdtd/kernels/common.cl
new file mode 100644
index 0000000..d203ca7
--- /dev/null
+++ b/fdtd/kernels/common.cl
@@ -0,0 +1,84 @@
+{#
+/* Common code for E, H updates
+ *
+ * Template parameters:
+ *  ftype	type name (e.g. float or double)
+ *  shape       list of 3 ints specifying shape of fields
+ */
+#}
+
+typedef {{ftype}} ftype;
+
+
+/*
+ * Field size info
+ */
+const size_t sx = {{shape[0]}};
+const size_t sy = {{shape[1]}};
+const size_t sz = {{shape[2]}};
+const size_t field_size = sx * sy * sz;
+
+//Since we use i to index into Ex[], Ey[], ... rather than E[], do nothing if
+// i is outside the bounds of Ex[].
+if (i >= field_size) {
+    PYOPENCL_ELWISE_CONTINUE;
+}
+
+
+/*
+ * Array indexing
+ */
+// Given a linear index i and shape (sx, sy, sz), defines x, y, and z
+//  as the 3D indices of the current element (i).
+// (ie, converts linear index [i] to field indices (x, y, z)
+const size_t x = i / (sz * sy);
+const size_t y = (i - x * sz * sy) / sz;
+const size_t z = (i - y * sz - x * sz * sy);
+
+// Calculate linear index offsets corresponding to offsets in 3D
+// (ie, if E[i] <-> E(x, y, z), then E[i + diy] <-> E(x, y + 1, z)
+const size_t dix = sz * sy;
+const size_t diy = sz;
+const size_t diz = 1;
+
+
+/*
+ * Pointer math
+ */
+//Pointer offsets into the components of a linearized vector-field
+// (eg. Hx = H + XX, where H and Hx are pointers)
+const size_t XX = 0;
+const size_t YY = field_size;
+const size_t ZZ = field_size * 2;
+
+//Define pointers to vector components of each field (eg. Hx = H + XX)
+__global ftype *Ex = E + XX;
+__global ftype *Ey = E + YY;
+__global ftype *Ez = E + ZZ;
+
+__global ftype *Hx = H + XX;
+__global ftype *Hy = H + YY;
+__global ftype *Hz = H + ZZ;
+
+
+/*
+ * Implement periodic boundary conditions
+ *
+ * mx ([m]inus [x]) gives the index offset of the adjacent cell in the minus-x direction.
+ * In the event that we start at x == 0, we actually want to wrap around and grab the cell
+ * x_{-1} == (sx - 1) instead, ie. mx = (sx - 1) * dix .
+ *
+ * px ([p]lus [x]) gives the index offset of the adjacent cell in the plus-x direction.
+ * In the event that we start at x == (sx - 1), we actually want to wrap around and grab
+ * the cell x_{+1} == 0 instead, ie. px = -(sx - 1) * dix .
+ */
+{% for r in 'xyz' %}
+int m{{r}} = -di{{r}};
+int p{{r}} = +di{{r}};
+int wrap_{{r}} = (s{{r}} - 1) * di{{r}};
+if ( {{r}} == 0 ) {
+  m{{r}} = wrap_{{r}};
+} else if ( {{r}} == s{{r}} - 1 ) {
+  p{{r}} = -wrap_{{r}};
+} 
+{% endfor %}
diff --git a/fdtd/kernels/update_e.cl b/fdtd/kernels/update_e.cl
new file mode 100644
index 0000000..2728b46
--- /dev/null
+++ b/fdtd/kernels/update_e.cl
@@ -0,0 +1,78 @@
+/*
+ *  Update E-field, including any PMLs.
+ *  
+ *  Template parameters:
+ *   common_header: Rendered contents of common.cl
+ *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
+ *   pml_thickness: Number of cells (integer)
+ *   
+ *  OpenCL args:
+ *   E, H, dt, eps, [p{01}e{np}, Psi_{xyz}{np}_E]
+ */
+
+{{common_header}}
+
+////////////////////////////////////////////////////////////////////////////
+
+__global ftype *epsx = eps + XX;
+__global ftype *epsy = eps + YY;
+__global ftype *epsz = eps + ZZ;
+
+{% if pmls -%}
+const int pml_thickness = {{pml_thickness}};
+{%- endif %}
+
+/*
+ *   Precalclate derivatives
+ */
+ftype dHxy = Hx[i] - Hx[i + my];
+ftype dHxz = Hx[i] - Hx[i + mz];
+
+ftype dHyx = Hy[i] - Hy[i + mx];
+ftype dHyz = Hy[i] - Hy[i + mz];
+
+ftype dHzx = Hz[i] - Hz[i + mx];
+ftype dHzy = Hz[i] - Hz[i + my];
+
+/*
+ *   PML Update
+ */
+// PML effects on E
+ftype pExi = 0;
+ftype pEyi = 0;
+ftype pEzi = 0;
+
+{% for r, p in pmls -%}
+    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
+    {%- set psi = 'Psi_' ~ r ~ p ~ '_E' -%}
+    {%- if r != 'y' -%}
+        {%- set se, sh = '-', '+' -%}
+    {%- else -%}
+        {%- set se, sh = '+', '-' -%}
+    {%- endif -%}
+
+    {%- if p == 'n' %}
+
+if ( {{r}} < pml_thickness ) {
+    const size_t ir = {{r}};                          // index into pml parameters
+
+    {%- elif p == 'p' %}
+
+if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_thickness ) {
+    const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
+
+    {%- endif %}
+    const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
+    {{psi ~ u}}[ip] = p0e{{p}}[ir] * {{psi ~ u}}[ip] + p1e{{p}}[ir] * dH{{v ~ r}};
+    {{psi ~ v}}[ip] = p0e{{p}}[ir] * {{psi ~ v}}[ip] + p1e{{p}}[ir] * dH{{u ~ r}};
+    pE{{u}}i {{se}}= {{psi ~ u}}[ip];
+    pE{{v}}i {{sh}}= {{psi ~ v}}[ip];
+}
+{%- endfor %}
+
+/*
+ *  Update E
+ */
+Ex[i] += dt / epsx[i] * (dHzy - dHyz + pExi);
+Ey[i] += dt / epsy[i] * (dHxz - dHzx + pEyi);
+Ez[i] += dt / epsz[i] * (dHyx - dHxy + pEzi);
diff --git a/fdtd/kernels/update_h.cl b/fdtd/kernels/update_h.cl
new file mode 100644
index 0000000..e093cb9
--- /dev/null
+++ b/fdtd/kernels/update_h.cl
@@ -0,0 +1,125 @@
+/*
+ *  Update H-field, including any PMLs.
+ *   Also precalculate values for poynting vector if necessary.
+ *  
+ *  Template parameters:
+ *   common_header: Rendered contents of common.cl
+ *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
+ *   pml_thickness: Number of cells (integer)
+ *   do_poynting: Whether to precalculate poynting vector components (boolean)  
+ *
+ *  OpenCL args:
+ *   E, H, dt, [p{01}h{np}, Psi_{xyz}{np}_H], [oS]
+ */
+
+{{common_header}}
+
+////////////////////////////////////////////////////////////////////////////
+
+{% if pmls -%}
+const int pml_thickness = {{pml_thickness}};
+{%- endif %}
+
+/*
+ *  Precalculate derivatives
+ */
+ftype dExy = Ex[i + py] - Ex[i];
+ftype dExz = Ex[i + pz] - Ex[i];
+
+ftype dEyx = Ey[i + px] - Ey[i];
+ftype dEyz = Ey[i + pz] - Ey[i];
+
+ftype dEzx = Ez[i + px] - Ez[i];
+ftype dEzy = Ez[i + py] - Ez[i];
+
+{%- if do_poynting %}
+
+
+/*
+ *  Precalculate averaged E
+ */
+ftype aExy = Ex[i + py] + Ex[i];
+ftype aExz = Ex[i + pz] + Ex[i];
+
+ftype aEyx = Ey[i + px] + Ey[i];
+ftype aEyz = Ey[i + pz] + Ey[i];
+
+ftype aEzx = Ez[i + px] + Ez[i];
+ftype aEzy = Ez[i + py] + Ez[i];
+{%- endif %}
+
+
+/*
+ *  PML Update
+ */
+// PML contributions to H
+ftype pHxi = 0;
+ftype pHyi = 0;
+ftype pHzi = 0;
+
+{%- for r, p in pmls -%}
+    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
+    {%- set psi = 'Psi_' ~ r ~ p ~ '_H' -%}
+    {%- if r != 'y' -%}
+        {%- set se, sh = '-', '+' -%}
+    {%- else -%}
+        {%- set se, sh = '+', '-' -%}
+    {%- endif -%}
+
+    {%- if p == 'n' %}
+
+if ( {{r}} < pml_thickness ) {
+    const size_t ir = {{r}};                          // index into pml parameters
+
+    {%- elif p == 'p' %}
+
+if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_thickness ) {
+    const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
+
+    {%- endif %}
+    const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
+    {{psi ~ u}}[ip] = p0h{{p}}[ir] * {{psi ~ u}}[ip] + p1h{{p}}[ir] * dE{{v ~ r}};
+    {{psi ~ v}}[ip] = p0h{{p}}[ir] * {{psi ~ v}}[ip] + p1h{{p}}[ir] * dE{{u ~ r}};
+    pH{{u}}i {{sh}}= {{psi ~ u}}[ip];
+    pH{{v}}i {{se}}= {{psi ~ v}}[ip];
+}
+{%- endfor %}
+
+/*
+ *  Update H
+ */
+{% if do_poynting -%}
+// Save old H for averaging
+ftype Hx_old = Hx[i];
+ftype Hy_old = Hy[i];
+ftype Hz_old = Hz[i];
+{%- endif %}
+
+// H update equations
+Hx[i] -= dt * (dEzy - dEyz + pHxi);
+Hy[i] -= dt * (dExz - dEzx + pHyi);
+Hz[i] -= dt * (dEyx - dExy + pHzi);
+
+{% if do_poynting -%}
+// Average H across timesteps
+ftype aHxt = Hx[i] + Hx_old;
+ftype aHyt = Hy[i] + Hy_old;
+ftype aHzt = Hz[i] + Hz_old;
+
+/*
+ *  Calculate unscaled S components at H locations
+ */
+__global ftype *oSxy = oS + 0 * field_size;
+__global ftype *oSyz = oS + 1 * field_size;
+__global ftype *oSzx = oS + 2 * field_size;
+__global ftype *oSxz = oS + 3 * field_size;
+__global ftype *oSyx = oS + 4 * field_size;
+__global ftype *oSzy = oS + 5 * field_size;
+
+oSxy[i] = aEyx * aHzt;
+oSxz[i] = -aEzx * aHyt;
+oSyz[i] = aEzy * aHxt;
+oSyx[i] = -aExy * aHzt;
+oSzx[i] = aExz * aHyt;
+oSzy[i] = -aEyz * aHxt;
+{%- endif -%}
diff --git a/fdtd/kernels/update_s.cl b/fdtd/kernels/update_s.cl
new file mode 100644
index 0000000..0310411
--- /dev/null
+++ b/fdtd/kernels/update_s.cl
@@ -0,0 +1,36 @@
+/*
+ *  Update E-field, including any PMLs.
+ *  
+ *  Template parameters:
+ *   common_header: Rendered contents of common.cl
+ *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
+ *   pml_thickness: Number of cells (integer)
+ *   
+ *  OpenCL args:
+ *   E, H, dt, S, oS
+ */
+
+{{common_header}}
+
+//////////////////////////////////////////////////////////////////////
+
+
+/*
+ * Calculate S from oS (pre-calculated components)
+ */ 
+__global ftype *Sx = S + XX;
+__global ftype *Sy = S + YY;
+__global ftype *Sz = S + ZZ;
+
+// Use unscaled S components from H locations 
+__global ftype *oSxy = oS + 0 * field_size;
+__global ftype *oSyz = oS + 1 * field_size;
+__global ftype *oSzx = oS + 2 * field_size;
+__global ftype *oSxz = oS + 3 * field_size;
+__global ftype *oSyx = oS + 4 * field_size;
+__global ftype *oSzy = oS + 5 * field_size;
+
+ftype s_factor = dt * 0.125;
+Sx[i] = (oSxy[i] + oSxz[i] + oSxy[i + my] + oSxz[i + mz]) * s_factor;
+Sy[i] = (oSyz[i] + oSyx[i] + oSyz[i + mz] + oSyx[i + mx]) * s_factor;
+Sz[i] = (oSzx[i] + oSzy[i] + oSzx[i + mx] + oSzy[i + my]) * s_factor;
diff --git a/fdtd/simulation.py b/fdtd/simulation.py
index 227dcb0..3dd9f62 100644
--- a/fdtd/simulation.py
+++ b/fdtd/simulation.py
@@ -3,15 +3,23 @@ Class for constructing and holding the basic FDTD operations and fields
 """
 
 from typing import List, Dict, Callable
+from collections import OrderedDict
 import numpy
+import jinja2
 import warnings
 
 import pyopencl
 import pyopencl.array
 from pyopencl.elementwise import ElementwiseKernel
 
-from . import boundary, base
-from .base import type_to_C
+from fdfd_tools import vec
+
+
+__author__ = 'Jan Petykiewicz'
+
+
+# Create jinja2 env on module load
+jinja_env = jinja2.Environment(loader=jinja2.PackageLoader(__name__, 'kernels'))
 
 
 class Simulation(object):
@@ -20,6 +28,7 @@ class Simulation(object):
     """
     E = None    # type: List[pyopencl.array.Array]
     H = None    # type: List[pyopencl.array.Array]
+    S = None    # type: List[pyopencl.array.Array]
     eps = None  # type: List[pyopencl.array.Array]
     dt = None   # type: float
 
@@ -30,24 +39,20 @@ class Simulation(object):
 
     update_E = None     # type: Callable[[],pyopencl.Event]
     update_H = None     # type: Callable[[],pyopencl.Event]
-
-    conductor_E = None  # type: Callable[[],pyopencl.Event]
-    conductor_H = None  # type: Callable[[],pyopencl.Event]
-
-    cpml_E = None       # type: Callable[[],pyopencl.Event]
-    cpml_H = None       # type: Callable[[],pyopencl.Event]
-
-    cpml_psi_E = None   # type: Dict[str, pyopencl.array.Array]
-    cpml_psi_H = None   # type: Dict[str, pyopencl.array.Array]
+    update_S = None     # type: Callable[[],pyopencl.Event]
+    sources = None      # type: Dict[str, str]
 
     def __init__(self,
                  epsilon: List[numpy.ndarray],
-                 dt: float=.99/numpy.sqrt(3),
-                 initial_E: List[numpy.ndarray]=None,
-                 initial_H: List[numpy.ndarray]=None,
-                 context: pyopencl.Context=None,
-                 queue: pyopencl.CommandQueue=None,
-                 float_type: numpy.float32 or numpy.float64=numpy.float32):
+                 dt: float = .99/numpy.sqrt(3),
+                 initial_E: List[numpy.ndarray] = None,
+                 initial_H: List[numpy.ndarray] = None,
+                 context: pyopencl.Context = None,
+                 queue: pyopencl.CommandQueue = None,
+                 float_type: numpy.float32 or numpy.float64 = numpy.float32,
+                 pml_thickness: int = 10,
+                 pmls: List[List[str]] = None,
+                 do_poynting: bool = True):
         """
         Initialize the simulation.
 
@@ -67,7 +72,7 @@ class Simulation(object):
             Exception('All epsilon grids must have the same shape. Shapes are {}', [e.shape for e in epsilon])
 
         if context is None:
-            self.context = pyopencl.create_some_context(False)
+            self.context = pyopencl.create_some_context()
         else:
             self.context = context
 
@@ -84,126 +89,149 @@ class Simulation(object):
             self.dt = dt
 
         self.arg_type = float_type
-
-        self.eps = [pyopencl.array.to_device(self.queue, e.astype(float_type)) for e in epsilon]
+        self.sources = {}
+        self.eps = pyopencl.array.to_device(self.queue, vec(epsilon).astype(float_type))
 
         if initial_E is None:
-            self.E = [pyopencl.array.zeros_like(self.eps[0]) for _ in range(3)]
+            self.E = pyopencl.array.zeros_like(self.eps)
         else:
             if len(initial_E) != 3:
                 Exception('Initial_E must be a list of length 3')
             if not all((E.shape == epsilon[0].shape for E in initial_E)):
                 Exception('Initial_E list elements must have same shape as epsilon elements')
-            self.E = [pyopencl.array.to_device(self.queue, E.astype(float_type)) for E in initial_E]
+            self.E = pyopencl.array.to_device(self.queue, vec(E).astype(float_type))
 
         if initial_H is None:
-            self.H = [pyopencl.array.zeros_like(self.eps[0]) for _ in range(3)]
+            self.H = pyopencl.array.zeros_like(self.eps)
         else:
             if len(initial_H) != 3:
                 Exception('Initial_H must be a list of length 3')
             if not all((H.shape == epsilon[0].shape for H in initial_H)):
                 Exception('Initial_H list elements must have same shape as epsilon elements')
-            self.H = [pyopencl.array.to_device(self.queue, H.astype(float_type)) for H in initial_H]
+            self.H = pyopencl.array.to_device(self.queue, vec(H).astype(float_type))
+
+        if pmls is None:
+            pmls = [[d, p] for d in 'xyz' for p in 'np']
 
         ctype = type_to_C(self.arg_type)
-        E_args = [ctype + ' *E' + c for c in 'xyz']
-        H_args = [ctype + ' *H' + c for c in 'xyz']
-        eps_args = [ctype + ' *eps' + c for c in 'xyz']
-        dt_arg = [ctype + ' dt']
+        def ptr(arg: str) -> str:
+            return ctype + ' *' + arg
 
-        sxyz = base.shape_source(epsilon[0].shape)
-        E_source = sxyz + base.dixyz_source + base.maxwell_E_source
-        H_source = sxyz + base.dixyz_source + base.maxwell_H_source
+        base_fields = OrderedDict()
+        base_fields[ptr('E')] = self.E
+        base_fields[ptr('H')] = self.H
+        base_fields[ctype + ' dt'] = self.dt
 
+        eps_field = OrderedDict()
+        eps_field[ptr('eps')] = self.eps
+
+        common_source = jinja_env.get_template('common.cl').render(
+                ftype=ctype,
+                shape=epsilon[0].shape,
+                )
+        jinja_args = {
+                'common_header': common_source,
+                'pml_thickness': pml_thickness,
+                'pmls': pmls,
+                'do_poynting': do_poynting,
+                }
+        E_source = jinja_env.get_template('update_e.cl').render(**jinja_args)
+        H_source = jinja_env.get_template('update_h.cl').render(**jinja_args)
+
+        self.sources['E'] = E_source
+        self.sources['H'] = H_source
+
+        if do_poynting:
+            S_source = jinja_env.get_template('update_s.cl').render(**jinja_args)
+            self.sources['S'] = S_source
+
+            self.oS = pyopencl.array.zeros(self.queue, self.E.shape + (2,), dtype=float_type)
+            self.S = pyopencl.array.zeros_like(self.E)
+            S_fields = OrderedDict()
+            S_fields[ptr('oS')] = self.oS
+            S_fields[ptr('S')] = self.S
+        else:
+            S_fields = OrderedDict() 
+
+        '''
+        PML
+        '''
+        m = (3.5, 1)
+        sigma_max = 0.8 * (m[0] + 1) / numpy.sqrt(1.0) # TODO: epsilon_eff (not 1.0)
+        alpha_max = 0  # TODO: Decide what to do about non-zero alpha
+
+        def par(x):
+            sigma = ((x / pml_thickness) ** m[0]) * sigma_max
+            alpha = ((1 - x / pml_thickness) ** m[1]) * alpha_max
+            p0 = numpy.exp(-(sigma + alpha) * dt)
+            p1 = sigma / (sigma + alpha) * (p0 - 1)
+            return p0, p1
+        
+        xen, xep, xhn, xhp = (numpy.arange(1, pml_thickness + 1, dtype=float_type)[::-1] for _ in range(4))
+        xep -= 0.5
+        xhn -= 0.5
+
+        pml_p_names = [['p' + a + eh + np for np in 'np' for a in '01'] for eh in 'eh']
+        pml_e_fields = OrderedDict()
+        pml_h_fields = OrderedDict()
+        for ne, nh, pe, ph in zip(*pml_p_names, par(xen) + par(xep), par(xhn) + par(xhp)):
+            pml_e_fields[ptr(ne)] = pyopencl.array.to_device(self.queue, pe)
+            pml_h_fields[ptr(nh)] = pyopencl.array.to_device(self.queue, ph)
+
+        for pml in pmls:
+            uv = 'xyz'.replace(pml[0], '')
+            psi_base = 'Psi_' + ''.join(pml) + '_'
+            psi_names = [[psi_base + eh + c for c in uv] for eh in 'EH']
+
+            psi_shape = list(epsilon[0].shape)
+            psi_shape['xyz'.find(pml[0])] = pml_thickness
+
+            for ne, nh in zip(*psi_names):
+                pml_e_fields[ptr(ne)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
+                pml_h_fields[ptr(nh)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
+        
+        self.pml_e_fields = pml_e_fields
+        self.pml_h_fields = pml_h_fields
+
+
+        '''
+        Create operations
+        '''
+        E_args = OrderedDict()
+        [E_args.update(d) for d in (base_fields, eps_field, pml_e_fields)]
         E_update = ElementwiseKernel(self.context, operation=E_source,
-                                     arguments=', '.join(E_args + H_args + dt_arg + eps_args))
+                                     arguments=', '.join(E_args.keys()))
 
+        H_args = OrderedDict()
+        [H_args.update(d) for d in (base_fields, pml_h_fields, S_fields)]
         H_update = ElementwiseKernel(self.context, operation=H_source,
-                                     arguments=', '.join(E_args + H_args + dt_arg))
+                                     arguments=', '.join(H_args.keys()))
+        self.update_E = lambda e: E_update(*E_args.values(), wait_for=e)
+        self.update_H = lambda e: H_update(*H_args.values(), wait_for=e)
 
-        self.update_E = lambda e: E_update(*self.E, *self.H, self.dt, *self.eps, wait_for=e)
-        self.update_H = lambda e: H_update(*self.E, *self.H, self.dt, wait_for=e)
+        if do_poynting:
+            S_args = OrderedDict()
+            [S_args.update(d) for d in (base_fields, S_fields)]
+            S_update = ElementwiseKernel(self.context, operation=S_source,
+                                         arguments=', '.join(S_args.keys()))
 
-    def init_cpml(self, pml_args: List[Dict]):
-        """
-        Initialize absorbing layers (cpml: complex phase matched layer). PMLs are not actual
-         boundary conditions, so you should add a conducting boundary (.init_conductors()) for
-         all directions in which you add PMLs.
-        Allows use of self.cpml_E(events) and self.cpml_H(events).
-        All necessary additional fields are created on the opencl device.
+            self.update_S = lambda e: S_update(*S_args.values(), wait_for=e)
 
-        :param pml_args: A list containing dictionaries which are passed to .boundary.cpml(...).
-            The dt argument is set automatically, but the others must be passed in each entry
-             of pml_args.
-        """
-        sxyz = base.shape_source(self.eps[0].shape)
 
-        # Prepare per-iteration constants for later use
-        pml_E_source = sxyz + base.dixyz_source + base.xyz_source
-        pml_H_source = sxyz + base.dixyz_source + base.xyz_source
+def type_to_C(float_type) -> str:
+    """
+    Returns a string corresponding to the C equivalent of a numpy type.
+    Only works for float16, float32, float64.
 
-        psi_E = []
-        psi_H = []
-        psi_E_names = []
-        psi_H_names = []
-        for arg_set in pml_args:
-            pml_data = boundary.cpml(dt=self.dt, **arg_set)
-
-            pml_E_source += pml_data['E']
-            pml_H_source += pml_data['H']
-
-            ti = numpy.delete(range(3), arg_set['direction'])
-            trans = [self.eps[0].shape[i] for i in ti]
-            psi_shape = (8, trans[0], trans[1])
-
-            psi_E += [pyopencl.array.zeros(self.queue, psi_shape, dtype=self.arg_type)
-                      for _ in pml_data['psi_E']]
-            psi_H += [pyopencl.array.zeros(self.queue, psi_shape, dtype=self.arg_type)
-                      for _ in pml_data['psi_H']]
-
-            psi_E_names += pml_data['psi_E']
-            psi_H_names += pml_data['psi_H']
-
-        ctype = type_to_C(self.arg_type)
-        E_args = [ctype + ' *E' + c for c in 'xyz']
-        H_args = [ctype + ' *H' + c for c in 'xyz']
-        eps_args = [ctype + ' *eps' + c for c in 'xyz']
-        dt_arg = [ctype + ' dt']
-        arglist_E = [ctype + ' *' + psi for psi in psi_E_names]
-        arglist_H = [ctype + ' *' + psi for psi in psi_H_names]
-        pml_E_args = ', '.join(E_args + H_args + dt_arg + eps_args + arglist_E)
-        pml_H_args = ', '.join(E_args + H_args + dt_arg + arglist_H)
-
-        pml_E = ElementwiseKernel(self.context, arguments=pml_E_args, operation=pml_E_source)
-        pml_H = ElementwiseKernel(self.context, arguments=pml_H_args, operation=pml_H_source)
-
-        self.cpml_E = lambda e: pml_E(*self.E, *self.H, self.dt, *self.eps, *psi_E, wait_for=e)
-        self.cpml_H = lambda e: pml_H(*self.E, *self.H, self.dt, *psi_H, wait_for=e)
-        self.cmpl_psi_E = {k: v for k, v in zip(psi_E_names, psi_E)}
-        self.cmpl_psi_H = {k: v for k, v in zip(psi_H_names, psi_H)}
-
-    def init_conductors(self, conductor_args: List[Dict]):
-        """
-        Initialize reflecting boundary conditions.
-        Allows use of self.conductor_E(events) and self.conductor_H(events).
-
-        :param conductor_args: List of dictionaries with which to call .boundary.conductor(...).
-        """
-
-        sxyz = base.shape_source(self.eps[0].shape)
-
-        # Prepare per-iteration constants
-        bc_E_source = sxyz + base.dixyz_source + base.xyz_source
-        bc_H_source = sxyz + base.dixyz_source + base.xyz_source
-        for arg_set in conductor_args:
-            [e, h] = boundary.conductor(**arg_set)
-            bc_E_source += e
-            bc_H_source += h
-
-        E_args = [type_to_C(self.arg_type) + ' *E' + c for c in 'xyz']
-        H_args = [type_to_C(self.arg_type) + ' *H' + c for c in 'xyz']
-        bc_E = ElementwiseKernel(self.context, arguments=E_args, operation=bc_E_source)
-        bc_H = ElementwiseKernel(self.context, arguments=H_args, operation=bc_H_source)
-
-        self.conductor_E = lambda e: bc_E(*self.E, wait_for=e)
-        self.conductor_H = lambda e: bc_H(*self.H, wait_for=e)
+    :param float_type: e.g. numpy.float32
+    :return: string containing the corresponding C type (eg. 'double')
+    """
+    if float_type == numpy.float16:
+        arg_type = 'half'
+    elif float_type == numpy.float32:
+        arg_type = 'float'
+    elif float_type == numpy.float64:
+        arg_type = 'double'
+    else:
+        raise Exception('Unsupported type')
+    return arg_type
diff --git a/requirements.txt b/requirements.txt
index 6121670..6ed651b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
 numpy
 h5py
 pyopencl
+jinja2
 -e git+https://mpxd.net/gogs/jan/float_raster.git@release#egg=float_raster
 -e git+https://mpxd.net/gogs/jan/gridlock.git@release#egg=gridlock
 -e git+https://mpxd.net/gogs/jan/masque.git@release#egg=masque
+-e git+https://mpxd.net/gogs/jan/fdfd_tools.git@master#egg=fdfd_tools

From c1750e9fde087ad26ecad5434380bbd1f7e7be28 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Wed, 29 Mar 2017 01:09:21 -0700
Subject: [PATCH 05/37] Update readme

---
 README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 5da2b6b..fbc972b 100644
--- a/README.md
+++ b/README.md
@@ -5,28 +5,32 @@ electromagnetic simulations on parallel compute hardware (mainly GPUs).
 
 **Performance** highly depends on what hardware you have available:
 * A 395x345x73 cell simulation (~10 million points, 8-cell absorbing boundaries)
- runs at around 42 iterations/sec. on my Nvidia GTX 580.
-* On my laptop (Nvidia 940M) the same simulation achieves ~8 iterations/sec.
+ runs at around 91 iterations/sec. on my AMD RX480.
+* On an Nvidia GTX 580, it runs at 66 iterations/sec
+* On my laptop (Nvidia 940M) the same simulation achieves ~12 iterations/sec.
 * An L3 photonic crystal cavity ringdown simulation (1550nm source, 40nm
- discretization, 8000 steps) takes about 5 minutes on my laptop.
+ discretization, 8000 steps) takes about 3 minutes on my laptop.
 
 **Capabilities** are currently pretty minimal:
 * Absorbing boundaries (CPML)
-* Conducting boundaries (PMC)
+* Perfect electrical conductors (PECs; to use set epsilon to inf)
 * Anisotropic media (eps_xx, eps_yy, eps_zz, mu_xx, ...)
 * Direct access to fields (eg., you can trivially add a soft or hard
- current source with just sim.E[1] += sin(f0 * t), or save any portion
+ current source with just sim.E[ind] += sin(f0 * t), or save any portion
  of a field to a file)
 
+
 ## Installation
 
 **Requirements:**
 * python 3 (written and tested with 3.5)
 * numpy
 * pyopencl
-* h5py (for file output)
+* jinja2
+* dill (for file output)
 * [gridlock](https://mpxd.net/gogs/jan/gridlock)
 * [masque](https://mpxd.net/gogs/jan/masque)
+* [fdfd_tools](https://mpxd.net/gogs/jan/fdfd_tools)
 
 To get the code, just clone this repository:
 ```bash

From 97d9901e4b0356d4a8ab77d65013e7f5b73bc009 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sat, 12 Aug 2017 19:20:29 -0700
Subject: [PATCH 06/37] Use logging package for output

---
 fdtd.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fdtd.py b/fdtd.py
index bbdabec..961bc0e 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -6,6 +6,7 @@ See main() for simulation setup.
 
 import sys
 import time
+import logging
 
 import numpy
 import lzma 
@@ -20,6 +21,9 @@ import fdfd_tools
 
 __author__ = 'Jan Petykiewicz'
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 
 def perturbed_l3(a: float, radius: float, **kwargs) -> Pattern:
     """
@@ -120,7 +124,7 @@ def main():
                        eps=n_air**2,
                        polygons=mask.as_polygons())
 
-    print('grid shape: {}'.format(grid.shape))
+    logger.info('grid shape: {}'.format(grid.shape))
     # #### Create the simulation grid ####
     sim = Simulation(grid.grids, do_poynting=True, pml_thickness=8)
 
@@ -157,7 +161,7 @@ def main():
         e.wait()
 
         if t % 100 == 0:
-            print('iteration {}: average {} iterations per sec'.format(t, (t+1)/(time.perf_counter()-start)))
+            logger.info('iteration {}: average {} iterations per sec'.format(t, (t+1)/(time.perf_counter()-start)))
             sys.stdout.flush()
 
     with lzma.open('saved_simulation', 'wb') as f:

From 0d91f0d43efba28e91dc1a88b5f3acab9294b799 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 14 Aug 2017 15:41:20 -0700
Subject: [PATCH 07/37] rename lib

---
 fdtd.py                                   | 7 ++++---
 fdtd/__init__.py                          | 0
 opencl_fdtd/__init__.py                   | 1 +
 {fdtd => opencl_fdtd}/kernels/common.cl   | 0
 {fdtd => opencl_fdtd}/kernels/update_e.cl | 0
 {fdtd => opencl_fdtd}/kernels/update_h.cl | 0
 {fdtd => opencl_fdtd}/kernels/update_s.cl | 0
 {fdtd => opencl_fdtd}/simulation.py       | 6 +++---
 8 files changed, 8 insertions(+), 6 deletions(-)
 delete mode 100644 fdtd/__init__.py
 create mode 100644 opencl_fdtd/__init__.py
 rename {fdtd => opencl_fdtd}/kernels/common.cl (100%)
 rename {fdtd => opencl_fdtd}/kernels/update_e.cl (100%)
 rename {fdtd => opencl_fdtd}/kernels/update_h.cl (100%)
 rename {fdtd => opencl_fdtd}/kernels/update_s.cl (100%)
 rename {fdtd => opencl_fdtd}/simulation.py (99%)

diff --git a/fdtd.py b/fdtd.py
index bbdabec..13af0fd 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -8,10 +8,10 @@ import sys
 import time
 
 import numpy
-import lzma 
+import lzma
 import dill
 
-from fdtd.simulation import Simulation
+from opencl_fdtd import Simulation
 from masque import Pattern, shapes
 import gridlock
 import pcgen
@@ -133,7 +133,7 @@ def main():
     def field_source(i):
         t0 = i * sim.dt - delay
         return numpy.sin(w * t0) * numpy.exp(-alpha * t0**2)
-   
+
     with open('sources.c', 'w') as f:
         f.write(sim.sources['E'])
         f.write('\n==========================================\n')
@@ -172,5 +172,6 @@ def main():
             d['S'] = unvec(sim.S.get())
         dill.dump(d, f)
 
+
 if __name__ == '__main__':
     main()
diff --git a/fdtd/__init__.py b/fdtd/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/opencl_fdtd/__init__.py b/opencl_fdtd/__init__.py
new file mode 100644
index 0000000..e3e95b4
--- /dev/null
+++ b/opencl_fdtd/__init__.py
@@ -0,0 +1 @@
+from .simulation import Simulation, type_to_C
diff --git a/fdtd/kernels/common.cl b/opencl_fdtd/kernels/common.cl
similarity index 100%
rename from fdtd/kernels/common.cl
rename to opencl_fdtd/kernels/common.cl
diff --git a/fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
similarity index 100%
rename from fdtd/kernels/update_e.cl
rename to opencl_fdtd/kernels/update_e.cl
diff --git a/fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
similarity index 100%
rename from fdtd/kernels/update_h.cl
rename to opencl_fdtd/kernels/update_h.cl
diff --git a/fdtd/kernels/update_s.cl b/opencl_fdtd/kernels/update_s.cl
similarity index 100%
rename from fdtd/kernels/update_s.cl
rename to opencl_fdtd/kernels/update_s.cl
diff --git a/fdtd/simulation.py b/opencl_fdtd/simulation.py
similarity index 99%
rename from fdtd/simulation.py
rename to opencl_fdtd/simulation.py
index f1a8dec..77a95df 100644
--- a/fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -152,7 +152,7 @@ class Simulation(object):
             S_fields[ptr('oS')] = self.oS
             S_fields[ptr('S')] = self.S
         else:
-            S_fields = OrderedDict() 
+            S_fields = OrderedDict()
 
         '''
         PML
@@ -167,7 +167,7 @@ class Simulation(object):
             p0 = numpy.exp(-(sigma + alpha) * dt)
             p1 = sigma / (sigma + alpha) * (p0 - 1)
             return p0, p1
-        
+
         xen, xep, xhn, xhp = (numpy.arange(1, pml_thickness + 1, dtype=float_type)[::-1] for _ in range(4))
         xep -= 0.5
         xhn -= 0.5
@@ -190,7 +190,7 @@ class Simulation(object):
             for ne, nh in zip(*psi_names):
                 pml_e_fields[ptr(ne)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
                 pml_h_fields[ptr(nh)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
-        
+
         self.pml_e_fields = pml_e_fields
         self.pml_h_fields = pml_h_fields
 

From a85f5477490f8f175dc42e2e61a67026b4c9063a Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Thu, 24 Aug 2017 11:28:03 -0700
Subject: [PATCH 08/37] doc updates

---
 README.md                       |  4 ++-
 opencl_fdtd/kernels/update_e.cl |  4 +--
 opencl_fdtd/kernels/update_h.cl |  4 +--
 opencl_fdtd/kernels/update_s.cl |  8 ++---
 opencl_fdtd/simulation.py       | 52 ++++++++++++++++++++++++++++++---
 5 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index fbc972b..26d5acd 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,12 @@ electromagnetic simulations on parallel compute hardware (mainly GPUs).
 * numpy
 * pyopencl
 * jinja2
+* [fdfd_tools](https://mpxd.net/gogs/jan/fdfd_tools)
+
+Optional (used for examples):
 * dill (for file output)
 * [gridlock](https://mpxd.net/gogs/jan/gridlock)
 * [masque](https://mpxd.net/gogs/jan/masque)
-* [fdfd_tools](https://mpxd.net/gogs/jan/fdfd_tools)
 
 To get the code, just clone this repository:
 ```bash
diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index 2728b46..4bda993 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -1,11 +1,11 @@
 /*
  *  Update E-field, including any PMLs.
- *  
+ *
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
  *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
  *   pml_thickness: Number of cells (integer)
- *   
+ *
  *  OpenCL args:
  *   E, H, dt, eps, [p{01}e{np}, Psi_{xyz}{np}_E]
  */
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index e093cb9..8519b51 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -1,12 +1,12 @@
 /*
  *  Update H-field, including any PMLs.
  *   Also precalculate values for poynting vector if necessary.
- *  
+ *
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
  *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
  *   pml_thickness: Number of cells (integer)
- *   do_poynting: Whether to precalculate poynting vector components (boolean)  
+ *   do_poynting: Whether to precalculate poynting vector components (boolean)
  *
  *  OpenCL args:
  *   E, H, dt, [p{01}h{np}, Psi_{xyz}{np}_H], [oS]
diff --git a/opencl_fdtd/kernels/update_s.cl b/opencl_fdtd/kernels/update_s.cl
index 0310411..94e061e 100644
--- a/opencl_fdtd/kernels/update_s.cl
+++ b/opencl_fdtd/kernels/update_s.cl
@@ -1,11 +1,11 @@
 /*
  *  Update E-field, including any PMLs.
- *  
+ *
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
  *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
  *   pml_thickness: Number of cells (integer)
- *   
+ *
  *  OpenCL args:
  *   E, H, dt, S, oS
  */
@@ -17,12 +17,12 @@
 
 /*
  * Calculate S from oS (pre-calculated components)
- */ 
+ */
 __global ftype *Sx = S + XX;
 __global ftype *Sy = S + YY;
 __global ftype *Sz = S + ZZ;
 
-// Use unscaled S components from H locations 
+// Use unscaled S components from H locations
 __global ftype *oSxy = oS + 0 * field_size;
 __global ftype *oSyz = oS + 1 * field_size;
 __global ftype *oSzx = oS + 2 * field_size;
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 77a95df..7a49f39 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -25,6 +25,35 @@ jinja_env = jinja2.Environment(loader=jinja2.PackageLoader(__name__, 'kernels'))
 class Simulation(object):
     """
     Constructs and holds the basic FDTD operations and related fields
+
+    After constructing this object, call the (update_E, update_H, update_S) members
+     to perform FDTD updates on the stored (E, H, S) fields:
+
+        sim = Simulation(grid.grids, do_poynting=True, pml_thickness=8)
+        with open('sources.c', 'w') as f:
+            f.write('{}'.format(sim.sources))
+
+        for t in range(max_t):
+            sim.update_E([]).wait()
+
+            # Find the linear index for the center point, for Ey
+            ind = numpy.ravel_multi_index(tuple(grid.shape//2), dims=grid.shape, order='C') + \
+                    numpy.prod(grid.shape) * 1
+            # Perturb the field (i.e., add a soft current source)
+            sim.E[ind] += numpy.sin(omega * t * sim.dt)
+            event = sim.update_H([])
+            if sim.update_S:
+                event = sim.update_S([event])
+            event.wait()
+
+            with lzma.open('saved_simulation', 'wb') as f:
+                dill.dump(fdfd_tools.unvec(sim.E.get(), grid.shape), f)
+
+    Code in the form
+        event2 = sim.update_H([event0, event1])
+     indicates that the update_H operation should be prepared immediately, but wait for
+     event0 and event1 to occur (i.e. previous operations to finish) before starting execution.
+     event2 can then be used to prepare further operations to be run after update_H.
     """
     E = None    # type: List[pyopencl.array.Array]
     H = None    # type: List[pyopencl.array.Array]
@@ -37,9 +66,9 @@ class Simulation(object):
     context = None      # type: pyopencl.Context
     queue = None        # type: pyopencl.CommandQueue
 
-    update_E = None     # type: Callable[[],pyopencl.Event]
-    update_H = None     # type: Callable[[],pyopencl.Event]
-    update_S = None     # type: Callable[[],pyopencl.Event]
+    update_E = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
+    update_H = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
+    update_S = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
     sources = None      # type: Dict[str, str]
 
     def __init__(self,
@@ -50,8 +79,8 @@ class Simulation(object):
                  context: pyopencl.Context = None,
                  queue: pyopencl.CommandQueue = None,
                  float_type: numpy.float32 or numpy.float64 = numpy.float32,
-                 pml_thickness: int = 10,
                  pmls: List[List[str]] = None,
+                 pml_thickness: int = 10,
                  do_poynting: bool = True):
         """
         Initialize the simulation.
@@ -64,6 +93,21 @@ class Simulation(object):
         :param context: pyOpenCL context. If not given, pyopencl.create_some_context(False) is called.
         :param queue: pyOpenCL command queue. If not given, pyopencl.CommandQueue(context) is called.
         :param float_type: numpy.float32 or numpy.float64. Default numpy.float32.
+        :param pmls: List of [axis, direction] pairs which specify simluation boundaries to be
+                'coated' with a PML (absorbing layer). Axis should be one of 'x', 'y', 'z', and
+                direction should be one of 'n', 'p' (i.e., negative, positive).
+                Default is to apply PMLs to all six boundaries.
+        :param pml_thickness: Thickness of any PMLs, in number of grid cells. Default 10.
+        :param do_poynting: If true, enables calculation of the poynting vector, S.
+                Poynting vector calculation adds the following computational burdens:
+                    * During update_H, ~6 extra additions/cell are performed in order to spatially
+                        average E and temporally average H. These quantities are multiplied
+                        (6 multiplications/cell) and then stored (6 writes/cell, cache-friendly).
+                    * update_S performs a discrete cross product using the precalculated products
+                        from update_H. This is not nice to the cache and similar to e.g. update_E
+                        in complexity.
+                    * GPU memory requirements are approximately doubled, since S and the intermediate
+                        products must be stored.
         """
 
         if len(epsilon) != 3:

From d02cd18403e8c07b8c3de7a10b335fec180a148a Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Sun, 1 Oct 2017 12:34:11 -0700
Subject: [PATCH 09/37] improve pml specification

---
 fdtd.py                         |  4 +-
 opencl_fdtd/kernels/update_e.cl | 17 +++----
 opencl_fdtd/kernels/update_h.cl | 20 ++++----
 opencl_fdtd/simulation.py       | 82 +++++++++++++++++++--------------
 4 files changed, 69 insertions(+), 54 deletions(-)

diff --git a/fdtd.py b/fdtd.py
index 5d0f4c3..08606c6 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -126,7 +126,9 @@ def main():
 
     logger.info('grid shape: {}'.format(grid.shape))
     # #### Create the simulation grid ####
-    sim = Simulation(grid.grids, do_poynting=True, pml_thickness=8)
+    pmls = [{'axis': a, 'polarity': p, 'thickness': pml_thickness}
+            for a in 'xyz' for p in 'np']
+    sim = Simulation(grid.grids, do_poynting=True, pmls=pmls)
 
     # Source parameters and function
     w = 2 * numpy.pi * dx / wl
diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index 4bda993..f7304e7 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -3,8 +3,8 @@
  *
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
- *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
- *   pml_thickness: Number of cells (integer)
+ *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
+        axes, polarities, and thicknesses.
  *
  *  OpenCL args:
  *   E, H, dt, eps, [p{01}e{np}, Psi_{xyz}{np}_E]
@@ -18,9 +18,6 @@ __global ftype *epsx = eps + XX;
 __global ftype *epsy = eps + YY;
 __global ftype *epsz = eps + ZZ;
 
-{% if pmls -%}
-const int pml_thickness = {{pml_thickness}};
-{%- endif %}
 
 /*
  *   Precalclate derivatives
@@ -42,7 +39,9 @@ ftype pExi = 0;
 ftype pEyi = 0;
 ftype pEzi = 0;
 
-{% for r, p in pmls -%}
+{% for pml in pmls -%}
+    {%- set r = pml['axis'] -%}
+    {%- set p = pml['polarity'] -%}
     {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
     {%- set psi = 'Psi_' ~ r ~ p ~ '_E' -%}
     {%- if r != 'y' -%}
@@ -51,14 +50,16 @@ ftype pEzi = 0;
         {%- set se, sh = '+', '-' -%}
     {%- endif -%}
 
+pml_{{r ~ p}}_thickness = {{pml['thickness']}};
+
     {%- if p == 'n' %}
 
-if ( {{r}} < pml_thickness ) {
+if ( {{r}} < pml_{{r ~ p}_thickness ) {
     const size_t ir = {{r}};                          // index into pml parameters
 
     {%- elif p == 'p' %}
 
-if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_thickness ) {
+if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}_thickness ) {
     const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
 
     {%- endif %}
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index 8519b51..7c648b6 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -4,22 +4,18 @@
  *
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
- *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
- *   pml_thickness: Number of cells (integer)
+ *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
+        axes, polarities, and thicknesses.
  *   do_poynting: Whether to precalculate poynting vector components (boolean)
  *
  *  OpenCL args:
- *   E, H, dt, [p{01}h{np}, Psi_{xyz}{np}_H], [oS]
+ *   E, H, dt, [p{xyz}{01}h{np}, Psi_{xyz}{np}_H], [oS]
  */
 
 {{common_header}}
 
 ////////////////////////////////////////////////////////////////////////////
 
-{% if pmls -%}
-const int pml_thickness = {{pml_thickness}};
-{%- endif %}
-
 /*
  *  Precalculate derivatives
  */
@@ -57,7 +53,9 @@ ftype pHxi = 0;
 ftype pHyi = 0;
 ftype pHzi = 0;
 
-{%- for r, p in pmls -%}
+{% for pml in pmls -%}
+    {%- set r = pml['axis'] -%}
+    {%- set p = pml['polarity'] -%}
     {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
     {%- set psi = 'Psi_' ~ r ~ p ~ '_H' -%}
     {%- if r != 'y' -%}
@@ -66,14 +64,16 @@ ftype pHzi = 0;
         {%- set se, sh = '+', '-' -%}
     {%- endif -%}
 
+pml_{{r ~ p}}_thickness = {{pml['thickness']}};
+
     {%- if p == 'n' %}
 
-if ( {{r}} < pml_thickness ) {
+if ( {{r}} < pml_{{r ~ p}}_thickness ) {
     const size_t ir = {{r}};                          // index into pml parameters
 
     {%- elif p == 'p' %}
 
-if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_thickness ) {
+if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
     const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
 
     {%- endif %}
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 7a49f39..78460ec 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -29,7 +29,8 @@ class Simulation(object):
     After constructing this object, call the (update_E, update_H, update_S) members
      to perform FDTD updates on the stored (E, H, S) fields:
 
-        sim = Simulation(grid.grids, do_poynting=True, pml_thickness=8)
+        pmls = [{'axis': a, 'polarity': p} for a in 'xyz' for p in 'np']
+        sim = Simulation(grid.grids, do_poynting=True, pmls=pmls)
         with open('sources.c', 'w') as f:
             f.write('{}'.format(sim.sources))
 
@@ -73,31 +74,34 @@ class Simulation(object):
 
     def __init__(self,
                  epsilon: List[numpy.ndarray],
+                 pmls: List[Dict[str, int or float]],
                  dt: float = .99/numpy.sqrt(3),
                  initial_E: List[numpy.ndarray] = None,
                  initial_H: List[numpy.ndarray] = None,
                  context: pyopencl.Context = None,
                  queue: pyopencl.CommandQueue = None,
                  float_type: numpy.float32 or numpy.float64 = numpy.float32,
-                 pmls: List[List[str]] = None,
-                 pml_thickness: int = 10,
                  do_poynting: bool = True):
         """
         Initialize the simulation.
 
         :param epsilon: List containing [eps_r,xx, eps_r,yy, eps_r,zz], where each element is a Yee-shifted ndarray
                 spanning the simulation domain. Relative epsilon is used.
-        :param dt: Time step. Default is the Courant factor.
+        :param pmls: List of dicts with keys:
+            'axis': One of 'x', 'y', 'z'.
+            'direction': One of 'n', 'p'.
+            'thickness': Number of layers, default 8.
+            'epsilon_eff': Effective epsilon to match to. Default 1.0.
+            'mu_eff': Effective mu to match to. Default 1.0.
+            'ln_R_per_layer': Desired (ln(R) / thickness) value. Default -1.6.
+            'm': Polynomial grading exponent. Default 3.5.
+            'ma': Exponent for alpha. Default 1.
+        :param dt: Time step. Default is .99/sqrt(3).
         :param initial_E: Initial E-field (default is 0 everywhere). Same format as epsilon.
         :param initial_H: Initial H-field (default is 0 everywhere). Same format as epsilon.
         :param context: pyOpenCL context. If not given, pyopencl.create_some_context(False) is called.
         :param queue: pyOpenCL command queue. If not given, pyopencl.CommandQueue(context) is called.
         :param float_type: numpy.float32 or numpy.float64. Default numpy.float32.
-        :param pmls: List of [axis, direction] pairs which specify simluation boundaries to be
-                'coated' with a PML (absorbing layer). Axis should be one of 'x', 'y', 'z', and
-                direction should be one of 'n', 'p' (i.e., negative, positive).
-                Default is to apply PMLs to all six boundaries.
-        :param pml_thickness: Thickness of any PMLs, in number of grid cells. Default 10.
         :param do_poynting: If true, enables calculation of the poynting vector, S.
                 Poynting vector calculation adds the following computational burdens:
                     * During update_H, ~6 extra additions/cell are performed in order to spatially
@@ -154,8 +158,13 @@ class Simulation(object):
                 Exception('Initial_H list elements must have same shape as epsilon elements')
             self.H = pyopencl.array.to_device(self.queue, vec(H).astype(float_type))
 
-        if pmls is None:
-            pmls = [[d, p] for d in 'xyz' for p in 'np']
+        for pml in pmls:
+            pml.setdefault('thickness', 8)
+            pml.setdefault('epsilon_eff', 1.0)
+            pml.setdefault('mu_eff', 1.0)
+            pml.setdefault('ln_R_per_layer', -1.6)
+            pml.setdefault('m', 3.5)
+            pml.setdefault('ma', 1)
 
         ctype = type_to_C(self.arg_type)
 
@@ -176,7 +185,6 @@ class Simulation(object):
                 )
         jinja_args = {
                 'common_header': common_source,
-                'pml_thickness': pml_thickness,
                 'pmls': pmls,
                 'do_poynting': do_poynting,
                 }
@@ -201,35 +209,39 @@ class Simulation(object):
         '''
         PML
         '''
-        m = (3.5, 1)
-        sigma_max = 0.8 * (m[0] + 1) / numpy.sqrt(1.0) # TODO: epsilon_eff (not 1.0)
-        alpha_max = 0  # TODO: Decide what to do about non-zero alpha
-
-        def par(x):
-            sigma = ((x / pml_thickness) ** m[0]) * sigma_max
-            alpha = ((1 - x / pml_thickness) ** m[1]) * alpha_max
-            p0 = numpy.exp(-(sigma + alpha) * dt)
-            p1 = sigma / (sigma + alpha) * (p0 - 1)
-            return p0, p1
-
-        xen, xep, xhn, xhp = (numpy.arange(1, pml_thickness + 1, dtype=float_type)[::-1] for _ in range(4))
-        xep -= 0.5
-        xhn -= 0.5
-
-        pml_p_names = [['p' + a + eh + np for np in 'np' for a in '01'] for eh in 'eh']
         pml_e_fields = OrderedDict()
         pml_h_fields = OrderedDict()
-        for ne, nh, pe, ph in zip(*pml_p_names, par(xen) + par(xep), par(xhn) + par(xhp)):
-            pml_e_fields[ptr(ne)] = pyopencl.array.to_device(self.queue, pe)
-            pml_h_fields[ptr(nh)] = pyopencl.array.to_device(self.queue, ph)
-
         for pml in pmls:
-            uv = 'xyz'.replace(pml[0], '')
-            psi_base = 'Psi_' + ''.join(pml) + '_'
+            a = 'xyz'.find(pml['axis'])
+
+            sigma_max = -pml['ln_R_per_layer'] / 2 * (pml['m'] + 1) / \
+                    numpy.sqrt(pml['epsilon_eff'] * pml['mu_eff'])
+            alpha_max = 0           # TODO: Nonzero alpha
+
+            def par(x):
+                sigma = ((x / pml['thickness']) ** pml['m']) * sigma_max
+                alpha = ((1 - x / pml['thickness']) ** pml['ma']) * alpha_max
+                p0 = numpy.exp(-(sigma + alpha) * dt)
+                p1 = sigma / (sigma + alpha) * (p0 - 1)
+                return p0, p1
+
+            xe, xh = (numpy.arange(1, pml['thickness'] + 1, dtype=float_type)[::-1] for _ in range(2))
+            if pml['polarity'] == 'p':
+                xe -= 0.5
+            elif pml['polarity'] == 'n':
+                xh -= 0.5
+
+            pml_p_names = [['p' + pml['axis'] + i + eh + pml['polarity'] for i in '01'] for eh in 'eh']
+            for name_e, name_h, pe, ph in zip(pml_p_names[0], pml_p_names[1], par(xe), par(xh)):
+                pml_e_fields[ptr(name_e)] = pyopencl.array.to_device(self.queue, pe)
+                pml_h_fields[ptr(name_h)] = pyopencl.array.to_device(self.queue, ph)
+
+            uv = 'xyz'.replace(pml['axis'], '')
+            psi_base = 'Psi_' + pml['axis'] + pml['polarity'] + '_'
             psi_names = [[psi_base + eh + c for c in uv] for eh in 'EH']
 
             psi_shape = list(epsilon[0].shape)
-            psi_shape['xyz'.find(pml[0])] = pml_thickness
+            psi_shape[a] = pml['thickness']
 
             for ne, nh in zip(*psi_names):
                 pml_e_fields[ptr(ne)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)

From 6bd756c3d3ab8025ae3f317cc9e8e243987bf7a4 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Sun, 1 Oct 2017 12:37:56 -0700
Subject: [PATCH 10/37] add setup.py

---
 setup.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 setup.py

diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a2f9f9d
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+from setuptools import setup, find_packages
+
+setup(name='opencl_fdtd',
+      version='0.4',
+      description='OpenCL FDTD solver',
+      author='Jan Petykiewicz',
+      author_email='anewusername@gmail.com',
+      url='https://mpxd.net/gogs/jan/opencl_fdtd',
+      packages=find_packages(),
+      package_data={
+          'opencl_fdfd': ['kernels/*']
+      },
+      install_requires=[
+            'numpy',
+            'pyopencl',
+            'jinja2',
+            'fdfd_tools>=0.3',
+      ],
+      extras_require={
+      },
+      )
+

From 9772f47a3482eb7a57872c0219d85753db8822ef Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Sun, 1 Oct 2017 14:01:22 -0700
Subject: [PATCH 11/37] fix typo

---
 opencl_fdtd/kernels/update_e.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index f7304e7..9162b21 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -54,12 +54,12 @@ pml_{{r ~ p}}_thickness = {{pml['thickness']}};
 
     {%- if p == 'n' %}
 
-if ( {{r}} < pml_{{r ~ p}_thickness ) {
+if ( {{r}} < pml_{{r ~ p}}_thickness ) {
     const size_t ir = {{r}};                          // index into pml parameters
 
     {%- elif p == 'p' %}
 
-if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}_thickness ) {
+if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
     const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
 
     {%- endif %}

From 6a03977a699a1cf5035e48f10d9b02496b0978f5 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Sun, 1 Oct 2017 13:11:19 -0700
Subject: [PATCH 12/37] fix pml param names

---
 opencl_fdtd/kernels/update_e.cl | 4 ++--
 opencl_fdtd/kernels/update_h.cl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index 9162b21..63a0c6f 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -64,8 +64,8 @@ if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
 
     {%- endif %}
     const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
-    {{psi ~ u}}[ip] = p0e{{p}}[ir] * {{psi ~ u}}[ip] + p1e{{p}}[ir] * dH{{v ~ r}};
-    {{psi ~ v}}[ip] = p0e{{p}}[ir] * {{psi ~ v}}[ip] + p1e{{p}}[ir] * dH{{u ~ r}};
+    {{psi ~ u}}[ip] = p{{r}}0e{{p}}[ir] * {{psi ~ u}}[ip] + p{{r}}1e{{p}}[ir] * dH{{v ~ r}};
+    {{psi ~ v}}[ip] = p{{r}}0e{{p}}[ir] * {{psi ~ v}}[ip] + p{{r}}1e{{p}}[ir] * dH{{u ~ r}};
     pE{{u}}i {{se}}= {{psi ~ u}}[ip];
     pE{{v}}i {{sh}}= {{psi ~ v}}[ip];
 }
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index 7c648b6..cdda3a9 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -78,8 +78,8 @@ if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
 
     {%- endif %}
     const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
-    {{psi ~ u}}[ip] = p0h{{p}}[ir] * {{psi ~ u}}[ip] + p1h{{p}}[ir] * dE{{v ~ r}};
-    {{psi ~ v}}[ip] = p0h{{p}}[ir] * {{psi ~ v}}[ip] + p1h{{p}}[ir] * dE{{u ~ r}};
+    {{psi ~ u}}[ip] = p{{r}}0h{{p}}[ir] * {{psi ~ u}}[ip] + p{{r}}1h{{p}}[ir] * dE{{v ~ r}};
+    {{psi ~ v}}[ip] = p{{r}}0h{{p}}[ir] * {{psi ~ v}}[ip] + p{{r}}1h{{p}}[ir] * dE{{u ~ r}};
     pH{{u}}i {{sh}}= {{psi ~ u}}[ip];
     pH{{v}}i {{se}}= {{psi ~ v}}[ip];
 }

From a67009d1c889208f1d6ec5fe2d91788d24a898b5 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Sun, 1 Oct 2017 13:14:27 -0700
Subject: [PATCH 13/37] fix declaration

---
 opencl_fdtd/kernels/update_e.cl | 2 +-
 opencl_fdtd/kernels/update_h.cl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index 63a0c6f..eed4ee5 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -50,7 +50,7 @@ ftype pEzi = 0;
         {%- set se, sh = '+', '-' -%}
     {%- endif -%}
 
-pml_{{r ~ p}}_thickness = {{pml['thickness']}};
+int pml_{{r ~ p}}_thickness = {{pml['thickness']}};
 
     {%- if p == 'n' %}
 
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index cdda3a9..27d7a47 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -64,7 +64,7 @@ ftype pHzi = 0;
         {%- set se, sh = '+', '-' -%}
     {%- endif -%}
 
-pml_{{r ~ p}}_thickness = {{pml['thickness']}};
+int pml_{{r ~ p}}_thickness = {{pml['thickness']}};
 
     {%- if p == 'n' %}
 

From aab57412a5b1ab39a4f2426c697612f20f1f3221 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Mon, 15 Jan 2018 22:36:40 -0800
Subject: [PATCH 14/37] move code to new location

---
 README.md        | 8 ++++----
 requirements.txt | 8 ++++----
 setup.py         | 2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 26d5acd..731450b 100644
--- a/README.md
+++ b/README.md
@@ -27,16 +27,16 @@ electromagnetic simulations on parallel compute hardware (mainly GPUs).
 * numpy
 * pyopencl
 * jinja2
-* [fdfd_tools](https://mpxd.net/gogs/jan/fdfd_tools)
+* [fdfd_tools](https://mpxd.net/code/jan/fdfd_tools)
 
 Optional (used for examples):
 * dill (for file output)
-* [gridlock](https://mpxd.net/gogs/jan/gridlock)
-* [masque](https://mpxd.net/gogs/jan/masque)
+* [gridlock](https://mpxd.net/code/jan/gridlock)
+* [masque](https://mpxd.net/code/jan/masque)
 
 To get the code, just clone this repository:
 ```bash
-git clone https://mpxd.net/gogs/jan/opencl_fdtd.git
+git clone https://mpxd.net/code/jan/opencl_fdtd.git
 ```
 
 You can install the requirements and their dependencies easily with
diff --git a/requirements.txt b/requirements.txt
index 6ed651b..6626171 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ numpy
 h5py
 pyopencl
 jinja2
--e git+https://mpxd.net/gogs/jan/float_raster.git@release#egg=float_raster
--e git+https://mpxd.net/gogs/jan/gridlock.git@release#egg=gridlock
--e git+https://mpxd.net/gogs/jan/masque.git@release#egg=masque
--e git+https://mpxd.net/gogs/jan/fdfd_tools.git@master#egg=fdfd_tools
+-e git+https://mpxd.net/code/jan/float_raster.git@release#egg=float_raster
+-e git+https://mpxd.net/code/jan/gridlock.git@release#egg=gridlock
+-e git+https://mpxd.net/code/jan/masque.git@release#egg=masque
+-e git+https://mpxd.net/code/jan/fdfd_tools.git@master#egg=fdfd_tools
diff --git a/setup.py b/setup.py
index a2f9f9d..55f5af9 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ setup(name='opencl_fdtd',
       description='OpenCL FDTD solver',
       author='Jan Petykiewicz',
       author_email='anewusername@gmail.com',
-      url='https://mpxd.net/gogs/jan/opencl_fdtd',
+      url='https://mpxd.net/code/jan/opencl_fdtd',
       packages=find_packages(),
       package_data={
           'opencl_fdfd': ['kernels/*']

From c8298d916fbec02925030d977288ab411d6d49c9 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sun, 16 Sep 2018 20:12:26 -0700
Subject: [PATCH 15/37] Use python3 for setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 55f5af9..c7e55f3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 from setuptools import setup, find_packages
 

From d0b303523e735193c021a7b7de64841dbcc47bc3 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sun, 16 Sep 2018 20:12:41 -0700
Subject: [PATCH 16/37] Move version string into module

---
 opencl_fdtd/__init__.py | 4 ++++
 setup.py                | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/opencl_fdtd/__init__.py b/opencl_fdtd/__init__.py
index e3e95b4..b621c19 100644
--- a/opencl_fdtd/__init__.py
+++ b/opencl_fdtd/__init__.py
@@ -1 +1,5 @@
 from .simulation import Simulation, type_to_C
+
+__author__ = 'Jan Petykiewicz'
+
+version = '0.4'
diff --git a/setup.py b/setup.py
index c7e55f3..8b098db 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 
 from setuptools import setup, find_packages
+import opencl_fdtd
 
 setup(name='opencl_fdtd',
-      version='0.4',
+      version=opencl_fdtd.version,
       description='OpenCL FDTD solver',
       author='Jan Petykiewicz',
       author_email='anewusername@gmail.com',

From 1de6fb0e39f39ad2b047116668ab44de0a5429e3 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sun, 16 Sep 2018 20:12:52 -0700
Subject: [PATCH 17/37] Use readme as long_description

---
 setup.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/setup.py b/setup.py
index 8b098db..b7fd3cf 100644
--- a/setup.py
+++ b/setup.py
@@ -3,9 +3,14 @@
 from setuptools import setup, find_packages
 import opencl_fdtd
 
+with open('README.md', 'r') as f:
+    long_description = f.read()
+
 setup(name='opencl_fdtd',
       version=opencl_fdtd.version,
       description='OpenCL FDTD solver',
+      long_description=long_description,
+      long_description_content_type='text/markdown',
       author='Jan Petykiewicz',
       author_email='anewusername@gmail.com',
       url='https://mpxd.net/code/jan/opencl_fdtd',

From ea5e298023e74eb3c1865658cfdaedb1f06d9c58 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 01:35:20 -0800
Subject: [PATCH 18/37] Explicitly cast to int

---
 opencl_fdtd/kernels/common.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/opencl_fdtd/kernels/common.cl b/opencl_fdtd/kernels/common.cl
index d203ca7..deecec9 100644
--- a/opencl_fdtd/kernels/common.cl
+++ b/opencl_fdtd/kernels/common.cl
@@ -73,9 +73,9 @@ __global ftype *Hz = H + ZZ;
  * the cell x_{+1} == 0 instead, ie. px = -(sx - 1) * dix .
  */
 {% for r in 'xyz' %}
-int m{{r}} = -di{{r}};
-int p{{r}} = +di{{r}};
-int wrap_{{r}} = (s{{r}} - 1) * di{{r}};
+int m{{r}} = - (int) di{{r}};
+int p{{r}} = + (int) di{{r}};
+int wrap_{{r}} = (s{{r}} - 1) * (int) di{{r}};
 if ( {{r}} == 0 ) {
   m{{r}} = wrap_{{r}};
 } else if ( {{r}} == s{{r}} - 1 ) {

From 1e874cb0c029cae2beeb4a0fae5d60de53106a83 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 01:36:13 -0800
Subject: [PATCH 19/37] Fix sign on H component of PML

---
 opencl_fdtd/kernels/update_h.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index 27d7a47..968c2b6 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -96,9 +96,9 @@ ftype Hz_old = Hz[i];
 {%- endif %}
 
 // H update equations
-Hx[i] -= dt * (dEzy - dEyz + pHxi);
-Hy[i] -= dt * (dExz - dEzx + pHyi);
-Hz[i] -= dt * (dEyx - dExy + pHzi);
+Hx[i] -= dt * (dEzy - dEyz - pHxi);
+Hy[i] -= dt * (dExz - dEzx - pHyi);
+Hz[i] -= dt * (dEyx - dExy - pHzi);
 
 {% if do_poynting -%}
 // Average H across timesteps

From 2b1d906665c6394561b613ba62af832bbfb72ea3 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 01:38:28 -0800
Subject: [PATCH 20/37] Add _create_field() and _create_eps()

---
 opencl_fdtd/simulation.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 78460ec..525d818 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -273,7 +273,24 @@ class Simulation(object):
                                          arguments=', '.join(S_args.keys()))
 
             self.update_S = lambda e: S_update(*S_args.values(), wait_for=e)
+    def _create_eps(self, epsilon: List[numpy.ndarray]):
+        if len(epsilon) != 3:
+            raise Exception('Epsilon must be a list with length of 3')
+        if not all((e.shape == epsilon[0].shape for e in epsilon[1:])):
+            raise Exception('All epsilon grids must have the same shape. Shapes are {}', [e.shape for e in epsilon])
+        if not epsilon[0].shape == self.shape:
+            raise Exception('Epsilon shape mismatch. Expected {}, got {}'.format(self.shape, epsilon[0].shape))
+        self.eps = pyopencl.array.to_device(self.queue, vec(epsilon).astype(self.arg_type))
 
+    def _create_field(self, initial_value: List[numpy.ndarray] = None):
+        if initial_value is None:
+            return pyopencl.array.zeros_like(self.eps)
+        else:
+            if len(initial_value) != 3:
+                Exception('Initial field value must be a list of length 3')
+            if not all((f.shape == self.shape for f in initial_value)):
+                Exception('Initial field list elements must have same shape as epsilon elements')
+            return pyopencl.array.to_device(self.queue, vec(initial_value).astype(self.arg_type))
 
 def type_to_C(float_type) -> str:
     """

From f00c8b4a3eb921fc8ed118f8b7f240178a5f3c47 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 01:59:05 -0800
Subject: [PATCH 21/37] Add _create_context(), _create_operation(), and
 _create_pmls(), and generalize initial field value args

---
 opencl_fdtd/simulation.py | 116 ++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 66 deletions(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 525d818..7b94c64 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -76,8 +76,7 @@ class Simulation(object):
                  epsilon: List[numpy.ndarray],
                  pmls: List[Dict[str, int or float]],
                  dt: float = .99/numpy.sqrt(3),
-                 initial_E: List[numpy.ndarray] = None,
-                 initial_H: List[numpy.ndarray] = None,
+                 initial_fields: Dict[str, List[numpy.ndarray]] = None,
                  context: pyopencl.Context = None,
                  queue: pyopencl.CommandQueue = None,
                  float_type: numpy.float32 or numpy.float64 = numpy.float32,
@@ -113,21 +112,14 @@ class Simulation(object):
                     * GPU memory requirements are approximately doubled, since S and the intermediate
                         products must be stored.
         """
+        if initial_fields is None:
+            initial_fields = {}
 
-        if len(epsilon) != 3:
-            Exception('Epsilon must be a list with length of 3')
-        if not all((e.shape == epsilon[0].shape for e in epsilon[1:])):
-            Exception('All epsilon grids must have the same shape. Shapes are {}', [e.shape for e in epsilon])
-
-        if context is None:
-            self.context = pyopencl.create_some_context()
-        else:
-            self.context = context
-
-        if queue is None:
-            self.queue = pyopencl.CommandQueue(self.context)
-        else:
-            self.queue = queue
+        self.shape = epsilon[0].shape
+        self.arg_type = float_type
+        self.sources = {}
+        self._create_context(context, queue)
+        self._create_eps(epsilon)
 
         if dt > .99/numpy.sqrt(3):
             warnings.warn('Warning: unstable dt: {}'.format(dt))
@@ -136,27 +128,8 @@ class Simulation(object):
         else:
             self.dt = dt
 
-        self.arg_type = float_type
-        self.sources = {}
-        self.eps = pyopencl.array.to_device(self.queue, vec(epsilon).astype(float_type))
-
-        if initial_E is None:
-            self.E = pyopencl.array.zeros_like(self.eps)
-        else:
-            if len(initial_E) != 3:
-                Exception('Initial_E must be a list of length 3')
-            if not all((E.shape == epsilon[0].shape for E in initial_E)):
-                Exception('Initial_E list elements must have same shape as epsilon elements')
-            self.E = pyopencl.array.to_device(self.queue, vec(E).astype(float_type))
-
-        if initial_H is None:
-            self.H = pyopencl.array.zeros_like(self.eps)
-        else:
-            if len(initial_H) != 3:
-                Exception('Initial_H must be a list of length 3')
-            if not all((H.shape == epsilon[0].shape for H in initial_H)):
-                Exception('Initial_H list elements must have same shape as epsilon elements')
-            self.H = pyopencl.array.to_device(self.queue, vec(H).astype(float_type))
+        self.E = self._create_field(initial_fields.get('E', None))
+        self.H = self._create_field(initial_fields.get('H', None))
 
         for pml in pmls:
             pml.setdefault('thickness', 8)
@@ -181,7 +154,7 @@ class Simulation(object):
 
         common_source = jinja_env.get_template('common.cl').render(
                 ftype=ctype,
-                shape=epsilon[0].shape,
+                shape=self.shape,
                 )
         jinja_args = {
                 'common_header': common_source,
@@ -194,21 +167,37 @@ class Simulation(object):
         self.sources['E'] = E_source
         self.sources['H'] = H_source
 
+
+
+        S_fields = OrderedDict()
         if do_poynting:
             S_source = jinja_env.get_template('update_s.cl').render(**jinja_args)
             self.sources['S'] = S_source
 
-            self.oS = pyopencl.array.zeros(self.queue, self.E.shape + (2,), dtype=float_type)
+            self.oS = pyopencl.array.zeros(self.queue, self.E.shape + (2,), dtype=self.arg_type)
             self.S = pyopencl.array.zeros_like(self.E)
-            S_fields = OrderedDict()
             S_fields[ptr('oS')] = self.oS
             S_fields[ptr('S')] = self.S
-        else:
-            S_fields = OrderedDict()
 
         '''
         PML
         '''
+        pml_e_fields, pml_h_fields = self._create_pmls(pmls)
+
+        '''
+        Create operations
+        '''
+        self.update_E = self._create_operation(E_source, (base_fields, eps_field, pml_e_fields))
+        self.update_H = self._create_operation(H_source, (base_fields, pml_h_fields, S_fields))
+        if do_poynting:
+            self.update_S = self._create_operation(S_source, (base_fields, S_fields))
+
+
+    def _create_pmls(self, pmls):
+        ctype = type_to_C(self.arg_type)
+        def ptr(arg: str) -> str:
+            return ctype + ' *' + arg
+
         pml_e_fields = OrderedDict()
         pml_h_fields = OrderedDict()
         for pml in pmls:
@@ -225,7 +214,7 @@ class Simulation(object):
                 p1 = sigma / (sigma + alpha) * (p0 - 1)
                 return p0, p1
 
-            xe, xh = (numpy.arange(1, pml['thickness'] + 1, dtype=float_type)[::-1] for _ in range(2))
+            xe, xh = (numpy.arange(1, pml['thickness'] + 1, dtype=self.arg_type)[::-1] for _ in range(2))
             if pml['polarity'] == 'p':
                 xe -= 0.5
             elif pml['polarity'] == 'n':
@@ -240,39 +229,34 @@ class Simulation(object):
             psi_base = 'Psi_' + pml['axis'] + pml['polarity'] + '_'
             psi_names = [[psi_base + eh + c for c in uv] for eh in 'EH']
 
-            psi_shape = list(epsilon[0].shape)
+            psi_shape = list(self.shape)
             psi_shape[a] = pml['thickness']
 
             for ne, nh in zip(*psi_names):
                 pml_e_fields[ptr(ne)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
                 pml_h_fields[ptr(nh)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
+        return pml_e_fields, pml_h_fields
 
-        self.pml_e_fields = pml_e_fields
-        self.pml_h_fields = pml_h_fields
+    def _create_operation(self, source, args_fields):
+        args = OrderedDict()
+        [args.update(d) for d in args_fields]
+        update = ElementwiseKernel(self.context, operation=source,
+                                   arguments=', '.join(args.keys()))
+        return lambda e: update(*args.values(), wait_for=e)
 
 
-        '''
-        Create operations
-        '''
-        E_args = OrderedDict()
-        [E_args.update(d) for d in (base_fields, eps_field, pml_e_fields)]
-        E_update = ElementwiseKernel(self.context, operation=E_source,
-                                     arguments=', '.join(E_args.keys()))
+    def _create_context(self, context: pyopencl.Context = None,
+                        queue: pyopencl.CommandQueue = None):
+        if context is None:
+            self.context = pyopencl.create_some_context()
+        else:
+            self.context = context
 
-        H_args = OrderedDict()
-        [H_args.update(d) for d in (base_fields, pml_h_fields, S_fields)]
-        H_update = ElementwiseKernel(self.context, operation=H_source,
-                                     arguments=', '.join(H_args.keys()))
-        self.update_E = lambda e: E_update(*E_args.values(), wait_for=e)
-        self.update_H = lambda e: H_update(*H_args.values(), wait_for=e)
+        if queue is None:
+            self.queue = pyopencl.CommandQueue(self.context)
+        else:
+            self.queue = queue
 
-        if do_poynting:
-            S_args = OrderedDict()
-            [S_args.update(d) for d in (base_fields, S_fields)]
-            S_update = ElementwiseKernel(self.context, operation=S_source,
-                                         arguments=', '.join(S_args.keys()))
-
-            self.update_S = lambda e: S_update(*S_args.values(), wait_for=e)
     def _create_eps(self, epsilon: List[numpy.ndarray]):
         if len(epsilon) != 3:
             raise Exception('Epsilon must be a list with length of 3')

From cb471df1827ac726e3a36d72a9868177897b90ba Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 02:00:30 -0800
Subject: [PATCH 22/37] Implement proper kappa for PML

---
 opencl_fdtd/kernels/update_e.cl |  2 ++
 opencl_fdtd/kernels/update_h.cl |  4 ++++
 opencl_fdtd/simulation.py       | 19 +++++++++++--------
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index eed4ee5..5a9ed1a 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -64,6 +64,8 @@ if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
 
     {%- endif %}
     const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
+    dH{{v ~ r}} *= p{{r}}2e{{p}}[ir];
+    dH{{u ~ r}} *= p{{r}}2e{{p}}[ir];
     {{psi ~ u}}[ip] = p{{r}}0e{{p}}[ir] * {{psi ~ u}}[ip] + p{{r}}1e{{p}}[ir] * dH{{v ~ r}};
     {{psi ~ v}}[ip] = p{{r}}0e{{p}}[ir] * {{psi ~ v}}[ip] + p{{r}}1e{{p}}[ir] * dH{{u ~ r}};
     pE{{u}}i {{se}}= {{psi ~ u}}[ip];
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index 968c2b6..56d9bbc 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -45,6 +45,8 @@ ftype aEzy = Ez[i + py] + Ez[i];
 {%- endif %}
 
 
+
+
 /*
  *  PML Update
  */
@@ -78,6 +80,8 @@ if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
 
     {%- endif %}
     const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
+    dE{{v ~ r}} *= p{{r}}2h{{p}}[ir];
+    dE{{u ~ r}} *= p{{r}}2h{{p}}[ir];
     {{psi ~ u}}[ip] = p{{r}}0h{{p}}[ir] * {{psi ~ u}}[ip] + p{{r}}1h{{p}}[ir] * dE{{v ~ r}};
     {{psi ~ v}}[ip] = p{{r}}0h{{p}}[ir] * {{psi ~ v}}[ip] + p{{r}}1h{{p}}[ir] * dE{{u ~ r}};
     pH{{u}}i {{sh}}= {{psi ~ u}}[ip];
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 7b94c64..1b6d9a2 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -203,16 +203,19 @@ class Simulation(object):
         for pml in pmls:
             a = 'xyz'.find(pml['axis'])
 
-            sigma_max = -pml['ln_R_per_layer'] / 2 * (pml['m'] + 1) / \
-                    numpy.sqrt(pml['epsilon_eff'] * pml['mu_eff'])
-            alpha_max = 0           # TODO: Nonzero alpha
+            sigma_max = -pml['ln_R_per_layer'] / 2 * (pml['m'] + 1)
+            kappa_max = numpy.sqrt(pml['mu_eff'] * pml['epsilon_eff'])
+            alpha_max = 0           # TODO: Nonzero alpha?
 
             def par(x):
-                sigma = ((x / pml['thickness']) ** pml['m']) * sigma_max
+                scaling = ((x / (pml['thickness'])) ** pml['m'])
+                sigma = scaling * sigma_max
+                kappa = 1 + scaling * (kappa_max - 1)
                 alpha = ((1 - x / pml['thickness']) ** pml['ma']) * alpha_max
-                p0 = numpy.exp(-(sigma + alpha) * dt)
-                p1 = sigma / (sigma + alpha) * (p0 - 1)
-                return p0, p1
+                p0 = numpy.exp(-(sigma / kappa + alpha) * self.dt)
+                p1 = sigma / (sigma + kappa * alpha) * (p0 - 1)
+                p2 = 1/kappa
+                return p0, p1, p2
 
             xe, xh = (numpy.arange(1, pml['thickness'] + 1, dtype=self.arg_type)[::-1] for _ in range(2))
             if pml['polarity'] == 'p':
@@ -220,7 +223,7 @@ class Simulation(object):
             elif pml['polarity'] == 'n':
                 xh -= 0.5
 
-            pml_p_names = [['p' + pml['axis'] + i + eh + pml['polarity'] for i in '01'] for eh in 'eh']
+            pml_p_names = [['p' + pml['axis'] + i + eh + pml['polarity'] for i in '012'] for eh in 'eh']
             for name_e, name_h, pe, ph in zip(pml_p_names[0], pml_p_names[1], par(xe), par(xh)):
                 pml_e_fields[ptr(name_e)] = pyopencl.array.to_device(self.queue, pe)
                 pml_h_fields[ptr(name_h)] = pyopencl.array.to_device(self.queue, ph)

From d8dc024626f872d2a9310745c96d02413b1e1982 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 02:01:28 -0800
Subject: [PATCH 23/37] Implementation of "source" fields (J)

---
 opencl_fdtd/kernels/update_j.cl | 32 ++++++++++++++++++++++++++++++++
 opencl_fdtd/simulation.py       | 24 +++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 opencl_fdtd/kernels/update_j.cl

diff --git a/opencl_fdtd/kernels/update_j.cl b/opencl_fdtd/kernels/update_j.cl
new file mode 100644
index 0000000..4d7f64d
--- /dev/null
+++ b/opencl_fdtd/kernels/update_j.cl
@@ -0,0 +1,32 @@
+/*
+ *  Update E-field from J field 
+ *
+ *  Template parameters:
+ *   common_header: Rendered contents of common.cl
+ *
+ *  OpenCL args:
+ *   E, Jr, Ji, c, s, xmin, xmax, ymin, ymax, zmin, zmax   
+ */
+
+{{common_header}}
+
+////////////////////////////////////////////////////////////////////////////
+
+__global ftype *Jrx = Jr + XX;
+__global ftype *Jry = Jr + YY;
+__global ftype *Jrz = Jr + ZZ;
+__global ftype *Jix = Ji + XX;
+__global ftype *Jiy = Ji + YY;
+__global ftype *Jiz = Ji + ZZ;
+
+
+if (x < xmin || y < ymin || z < zmin) {
+   PYOPENCL_ELWISE_CONTINUE;
+}
+if (x >= xmax || y >= ymax || z >= zmax) {
+   PYOPENCL_ELWISE_CONTINUE;
+}
+
+Ex[i] += c * Jrx[i] + s * Jix[i];
+Ey[i] += c * Jry[i] + s * Jiy[i];
+Ez[i] += c * Jrz[i] + s * Jiz[i];
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 1b6d9a2..6e12b2e 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -70,6 +70,7 @@ class Simulation(object):
     update_E = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
     update_H = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
     update_S = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
+    update_J = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
     sources = None      # type: Dict[str, str]
 
     def __init__(self,
@@ -80,7 +81,8 @@ class Simulation(object):
                  context: pyopencl.Context = None,
                  queue: pyopencl.CommandQueue = None,
                  float_type: numpy.float32 or numpy.float64 = numpy.float32,
-                 do_poynting: bool = True):
+                 do_poynting: bool = True,
+                 do_fieldsrc: bool = False):
         """
         Initialize the simulation.
 
@@ -179,6 +181,17 @@ class Simulation(object):
             S_fields[ptr('oS')] = self.oS
             S_fields[ptr('S')] = self.S
 
+        J_fields = OrderedDict()
+        if do_fieldsrc:
+            J_source = jinja_env.get_template('update_j.cl').render(**jinja_args)
+            self.sources['J'] = J_source
+
+            self.Ji = pyopencl.array.zeros_like(self.E)
+            self.Jr = pyopencl.array.zeros_like(self.E)
+            J_fields[ptr('Jr')] = self.Jr
+            J_fields[ptr('Ji')] = self.Ji
+
+
         '''
         PML
         '''
@@ -191,6 +204,15 @@ class Simulation(object):
         self.update_H = self._create_operation(H_source, (base_fields, pml_h_fields, S_fields))
         if do_poynting:
             self.update_S = self._create_operation(S_source, (base_fields, S_fields))
+        if do_fieldsrc:
+            args = OrderedDict()
+            [args.update(d) for d in (base_fields, J_fields)]
+            var_args = [ctype + ' ' + v for v in 'cs'] + ['uint ' + r + m for r in 'xyz' for m in ('min', 'max')]
+            print(var_args)
+            update = ElementwiseKernel(self.context, operation=J_source,
+                                       arguments=', '.join(list(args.keys()) + var_args))
+            #print(len(args.values()),'\n\n',  args.values(), args.keys())
+            self.update_J = lambda e, *a: update(*args.values(), *a, wait_for=e)
 
 
     def _create_pmls(self, pmls):

From 3ac20f62718f52d3ea710fb4fb24d135eb80faf7 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Thu, 29 Nov 2018 02:01:48 -0800
Subject: [PATCH 24/37] Add bloch boundaries (untested)

---
 opencl_fdtd/kernels/update_e.cl | 12 ++++++++++++
 opencl_fdtd/kernels/update_h.cl | 15 +++++++++++++++
 opencl_fdtd/simulation.py       | 33 ++++++++++++++++++++++++++++++++-
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index 5a9ed1a..c578414 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -31,6 +31,18 @@ ftype dHyz = Hy[i] - Hy[i + mz];
 ftype dHzx = Hz[i] - Hz[i + mx];
 ftype dHzy = Hz[i] - Hz[i + my];
 
+{% for bloch in bloch_boundaries -%}
+    {%- set r = bloch['axis'] -%}
+    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
+if ({{r}} == 0) {
+    ftype bloch_im = {{bloch['real']}};
+    ftype bloch_re = {{bloch['imag']}};
+    dH{{u ~ r}} = bloch_re * dH{{v ~ r}} + bloch_im * (G{{u}}[i] - G{{u}}[i + m{{u}}]);
+    dH{{v ~ r}} = bloch_re * dH{{v ~ r}} + bloch_im * (G{{v}}[i] - G{{v}}[i + m{{v}}]);
+}
+{%- endfor %}
+
+
 /*
  *   PML Update
  */
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index 56d9bbc..0ef15dd 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -28,6 +28,21 @@ ftype dEyz = Ey[i + pz] - Ey[i];
 ftype dEzx = Ez[i + px] - Ez[i];
 ftype dEzy = Ez[i + py] - Ez[i];
 
+
+{% for bloch in bloch_boundaries -%}
+    {%- set r = bloch['axis'] -%}
+    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
+if ({{r}} == s{{r}} - 1) {
+    ftype bloch_re = {{bloch['real']}};
+    ftype bloch_im = {{bloch['imag']}};
+    dE{{u ~ r}} = bloch_re * dE{{u ~ r}} + bloch_im * (F{{u}}[i + p{{u}}] - F{{u}}[i]);
+    dE{{v ~ r}} = bloch_re * dE{{v ~ r}} + bloch_im * (F{{v}}[i + p{{v}}] - F{{v}}[i]);
+}
+{%- endfor %}
+
+
+
+
 {%- if do_poynting %}
 
 
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 6e12b2e..9b23cad 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -76,6 +76,7 @@ class Simulation(object):
     def __init__(self,
                  epsilon: List[numpy.ndarray],
                  pmls: List[Dict[str, int or float]],
+                 bloch_boundaries: List[Dict[str, int or float]] = (),
                  dt: float = .99/numpy.sqrt(3),
                  initial_fields: Dict[str, List[numpy.ndarray]] = None,
                  context: pyopencl.Context = None,
@@ -132,6 +133,9 @@ class Simulation(object):
 
         self.E = self._create_field(initial_fields.get('E', None))
         self.H = self._create_field(initial_fields.get('H', None))
+        if bloch_boundaries:
+            self.F = self._create_field(initial_fields.get('F', None))
+            self.G = self._create_field(initial_fields.get('G', None))
 
         for pml in pmls:
             pml.setdefault('thickness', 8)
@@ -154,6 +158,17 @@ class Simulation(object):
         eps_field = OrderedDict()
         eps_field[ptr('eps')] = self.eps
 
+        if bloch_boundaries:
+            base_fields[ptr('F')] = self.F
+            base_fields[ptr('G')] = self.G
+
+            bloch_fields = OrderedDict()
+            bloch_fields[ptr('E')] = self.F
+            bloch_fields[ptr('H')] = self.G
+            bloch_fields[ctype + ' dt'] = self.dt
+            bloch_fields[ptr('F')] = self.E
+            bloch_fields[ptr('G')] = self.H
+
         common_source = jinja_env.get_template('common.cl').render(
                 ftype=ctype,
                 shape=self.shape,
@@ -162,13 +177,24 @@ class Simulation(object):
                 'common_header': common_source,
                 'pmls': pmls,
                 'do_poynting': do_poynting,
+                'bloch': bloch_boundaries,
                 }
         E_source = jinja_env.get_template('update_e.cl').render(**jinja_args)
         H_source = jinja_env.get_template('update_h.cl').render(**jinja_args)
-
         self.sources['E'] = E_source
         self.sources['H'] = H_source
 
+        if bloch_boundaries:
+            bloch_args = jinja_args.copy()
+            bloch_args['do_poynting'] = False
+            bloch_args['bloch'] = [{'axis': b['axis'],
+                                    'real': b['imag'],
+                                    'imag': b['real']}
+                                   for b in bloch_boundaries]
+            F_source = jinja_env.get_template('update_e.cl').render(**bloch_args)
+            G_source = jinja_env.get_template('update_h.cl').render(**bloch_args)
+            self.sources['F'] = F_source
+            self.sources['G'] = G_source
 
 
         S_fields = OrderedDict()
@@ -196,6 +222,8 @@ class Simulation(object):
         PML
         '''
         pml_e_fields, pml_h_fields = self._create_pmls(pmls)
+        if bloch_boundaries:
+            pml_f_fields, pml_g_fields = self._create_pmls(pmls)
 
         '''
         Create operations
@@ -204,6 +232,9 @@ class Simulation(object):
         self.update_H = self._create_operation(H_source, (base_fields, pml_h_fields, S_fields))
         if do_poynting:
             self.update_S = self._create_operation(S_source, (base_fields, S_fields))
+        if bloch_boundaries:
+            self.update_F = self._create_operation(F_source, (bloch_fields, eps_field, pml_f_fields))
+            self.update_G = self._create_operation(G_source, (bloch_fields, pml_g_fields))
         if do_fieldsrc:
             args = OrderedDict()
             [args.update(d) for d in (base_fields, J_fields)]

From 5ffe547640c78af55a7a8e802f8a503cee21e950 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Fri, 30 Nov 2018 00:13:27 -0800
Subject: [PATCH 25/37] Typo fixes and minor comment updates

---
 opencl_fdtd/kernels/update_e.cl |  4 ++--
 opencl_fdtd/kernels/update_h.cl |  2 +-
 opencl_fdtd/simulation.py       | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index c578414..f44527d 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -4,7 +4,7 @@
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
  *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
-        axes, polarities, and thicknesses.
+ *      axes, polarities, and thicknesses.
  *
  *  OpenCL args:
  *   E, H, dt, eps, [p{01}e{np}, Psi_{xyz}{np}_E]
@@ -20,7 +20,7 @@ __global ftype *epsz = eps + ZZ;
 
 
 /*
- *   Precalclate derivatives
+ *   Precalculate derivatives
  */
 ftype dHxy = Hx[i] - Hx[i + my];
 ftype dHxz = Hx[i] - Hx[i + mz];
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index 0ef15dd..f1c19cb 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -5,7 +5,7 @@
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
  *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
-        axes, polarities, and thicknesses.
+ *      axes, polarities, and thicknesses.
  *   do_poynting: Whether to precalculate poynting vector components (boolean)
  *
  *  OpenCL args:
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 9b23cad..0eaef5d 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -56,11 +56,11 @@ class Simulation(object):
      event0 and event1 to occur (i.e. previous operations to finish) before starting execution.
      event2 can then be used to prepare further operations to be run after update_H.
     """
-    E = None    # type: List[pyopencl.array.Array]
-    H = None    # type: List[pyopencl.array.Array]
-    S = None    # type: List[pyopencl.array.Array]
-    eps = None  # type: List[pyopencl.array.Array]
-    dt = None   # type: float
+    E = None            # type: pyopencl.array.Array
+    H = None            # type: pyopencl.array.Array
+    S = None            # type: pyopencl.array.Array
+    eps = None          # type: pyopencl.array.Array
+    dt = None           # type: float
 
     arg_type = None     # type: numpy.float32 or numpy.float64
 

From f5a5a0ad04176e8ebbca15c8e4a53300e5b2f5e6 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Fri, 30 Nov 2018 00:36:11 -0800
Subject: [PATCH 26/37] Enable nonuniform grids (minimally tested)

---
 opencl_fdtd/kernels/update_e.cl | 28 +++++++++++++++++++++-------
 opencl_fdtd/kernels/update_h.cl | 28 +++++++++++++++++++++-------
 opencl_fdtd/simulation.py       | 28 +++++++++++++++++++++++++---
 3 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/opencl_fdtd/kernels/update_e.cl b/opencl_fdtd/kernels/update_e.cl
index f44527d..9127181 100644
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@@ -5,9 +5,12 @@
  *   common_header: Rendered contents of common.cl
  *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
  *      axes, polarities, and thicknesses.
+ *   uniform_dx: If grid is uniform, uniform_dx should be the grid spacing.
+ *      Otherwise, uniform_dx should be False and [inv_dh{xyz}] arrays must be supplied as
+ *      OpenCL args.
  *
  *  OpenCL args:
- *   E, H, dt, eps, [p{01}e{np}, Psi_{xyz}{np}_E]
+ *   E, H, dt, eps, [p{012}e{np}, Psi_{xyz}{np}_E], [inv_dh{xyz}]
  */
 
 {{common_header}}
@@ -19,17 +22,28 @@ __global ftype *epsy = eps + YY;
 __global ftype *epsz = eps + ZZ;
 
 
+{%- if uniform_dx %}
+ftype inv_dx = 1.0 / {{uniform_dx}};
+ftype inv_dy = 1.0 / {{uniform_dx}};
+ftype inv_dz = 1.0 / {{uniform_dx}};
+{%- else %}
+ftype inv_dx = inv_dhx[x];
+ftype inv_dy = inv_dhy[y];
+ftype inv_dz = inv_dhz[z];
+{%- endif %}
+
+
 /*
  *   Precalculate derivatives
  */
-ftype dHxy = Hx[i] - Hx[i + my];
-ftype dHxz = Hx[i] - Hx[i + mz];
+ftype dHxy = (Hx[i] - Hx[i + my]) * inv_dy;
+ftype dHxz = (Hx[i] - Hx[i + mz]) * inv_dz;
 
-ftype dHyx = Hy[i] - Hy[i + mx];
-ftype dHyz = Hy[i] - Hy[i + mz];
+ftype dHyx = (Hy[i] - Hy[i + mx]) * inv_dx;
+ftype dHyz = (Hy[i] - Hy[i + mz]) * inv_dz;
 
-ftype dHzx = Hz[i] - Hz[i + mx];
-ftype dHzy = Hz[i] - Hz[i + my];
+ftype dHzx = (Hz[i] - Hz[i + mx]) * inv_dx;
+ftype dHzy = (Hz[i] - Hz[i + my]) * inv_dy;
 
 {% for bloch in bloch_boundaries -%}
     {%- set r = bloch['axis'] -%}
diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index f1c19cb..ac75157 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -7,9 +7,12 @@
  *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
  *      axes, polarities, and thicknesses.
  *   do_poynting: Whether to precalculate poynting vector components (boolean)
+ *   uniform_dx: If grid is uniform, uniform_dx should be the grid spacing.
+ *      Otherwise, uniform_dx should be False and [inv_de{xyz}] arrays must be supplied as
+ *      OpenCL args.
  *
  *  OpenCL args:
- *   E, H, dt, [p{xyz}{01}h{np}, Psi_{xyz}{np}_H], [oS]
+ *   E, H, dt, [inv_de{xyz}], [p{xyz}{01}h{np}, Psi_{xyz}{np}_H], [oS]
  */
 
 {{common_header}}
@@ -19,14 +22,25 @@
 /*
  *  Precalculate derivatives
  */
-ftype dExy = Ex[i + py] - Ex[i];
-ftype dExz = Ex[i + pz] - Ex[i];
+{%- if uniform_dx %}
+ftype inv_dx = 1.0 / {{uniform_dx}};
+ftype inv_dy = 1.0 / {{uniform_dx}};
+ftype inv_dz = 1.0 / {{uniform_dx}};
+{%- else %}
+ftype inv_dx = inv_dex[x];
+ftype inv_dy = inv_dey[y];
+ftype inv_dz = inv_dez[z];
+{%- endif %}
 
-ftype dEyx = Ey[i + px] - Ey[i];
-ftype dEyz = Ey[i + pz] - Ey[i];
 
-ftype dEzx = Ez[i + px] - Ez[i];
-ftype dEzy = Ez[i + py] - Ez[i];
+ftype dExy = (Ex[i + py] - Ex[i]) * inv_dy;
+ftype dExz = (Ex[i + pz] - Ex[i]) * inv_dz;
+
+ftype dEyx = (Ey[i + px] - Ey[i]) * inv_dx;
+ftype dEyz = (Ey[i + pz] - Ey[i]) * inv_dz;
+
+ftype dEzx = (Ez[i + px] - Ez[i]) * inv_dx;
+ftype dEzy = (Ez[i + py] - Ez[i]) * inv_dy;
 
 
 {% for bloch in bloch_boundaries -%}
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 0eaef5d..940725a 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -61,6 +61,7 @@ class Simulation(object):
     S = None            # type: pyopencl.array.Array
     eps = None          # type: pyopencl.array.Array
     dt = None           # type: float
+    inv_dxes = None     # type: List[pyopencl.array.Array]
 
     arg_type = None     # type: numpy.float32 or numpy.float64
 
@@ -77,7 +78,8 @@ class Simulation(object):
                  epsilon: List[numpy.ndarray],
                  pmls: List[Dict[str, int or float]],
                  bloch_boundaries: List[Dict[str, int or float]] = (),
-                 dt: float = .99/numpy.sqrt(3),
+                 dxes: List[List[numpy.ndarray]] or float = None,
+                 dt: float = None,
                  initial_fields: Dict[str, List[numpy.ndarray]] = None,
                  context: pyopencl.Context = None,
                  queue: pyopencl.CommandQueue = None,
@@ -98,7 +100,7 @@ class Simulation(object):
             'ln_R_per_layer': Desired (ln(R) / thickness) value. Default -1.6.
             'm': Polynomial grading exponent. Default 3.5.
             'ma': Exponent for alpha. Default 1.
-        :param dt: Time step. Default is .99/sqrt(3).
+        :param dt: Time step. Default is min(dxes) * .99/sqrt(3).
         :param initial_E: Initial E-field (default is 0 everywhere). Same format as epsilon.
         :param initial_H: Initial H-field (default is 0 everywhere). Same format as epsilon.
         :param context: pyOpenCL context. If not given, pyopencl.create_some_context(False) is called.
@@ -124,7 +126,22 @@ class Simulation(object):
         self._create_context(context, queue)
         self._create_eps(epsilon)
 
-        if dt > .99/numpy.sqrt(3):
+        if dxes is None:
+            dxes = 1.0
+
+        if isinstance(dxes, (float, int)):
+            uniform_dx = dxes
+            min_dx = dxes
+        else:
+            uniform_dx = False
+            self.inv_dxes = [self._create_field(1 / dxn) for dxn in dxes[0] + dxes[1]]
+            min_dx = min(min(dxn) for dxn in dxes[0] + dxes[1])
+
+        max_dt = min_dx * .99 / numpy.sqrt(3)
+
+        if dt is None:
+            self.dt = max_dt
+        elif dt > max_dt:
             warnings.warn('Warning: unstable dt: {}'.format(dt))
         elif dt <= 0:
             raise Exception('Invalid dt: {}'.format(dt))
@@ -154,6 +171,10 @@ class Simulation(object):
         base_fields[ptr('E')] = self.E
         base_fields[ptr('H')] = self.H
         base_fields[ctype + ' dt'] = self.dt
+        if uniform_dx == False:
+            inv_dx_names = ['inv_d' + eh + r for eh in 'eh' for r in 'xyz']
+            for name, field in zip(inv_dx_names, self.inv_dxes):
+                base_fields[ptr(name)] = field
 
         eps_field = OrderedDict()
         eps_field[ptr('eps')] = self.eps
@@ -178,6 +199,7 @@ class Simulation(object):
                 'pmls': pmls,
                 'do_poynting': do_poynting,
                 'bloch': bloch_boundaries,
+                'uniform_dx': uniform_dx,
                 }
         E_source = jinja_env.get_template('update_e.cl').render(**jinja_args)
         H_source = jinja_env.get_template('update_h.cl').render(**jinja_args)

From 65cc1183921b6107b8487fc762a1606bb6512d8a Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Fri, 30 Nov 2018 00:36:22 -0800
Subject: [PATCH 27/37] Remove debug code

---
 opencl_fdtd/simulation.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 940725a..087e9c4 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -261,10 +261,8 @@ class Simulation(object):
             args = OrderedDict()
             [args.update(d) for d in (base_fields, J_fields)]
             var_args = [ctype + ' ' + v for v in 'cs'] + ['uint ' + r + m for r in 'xyz' for m in ('min', 'max')]
-            print(var_args)
             update = ElementwiseKernel(self.context, operation=J_source,
                                        arguments=', '.join(list(args.keys()) + var_args))
-            #print(len(args.values()),'\n\n',  args.values(), args.keys())
             self.update_J = lambda e, *a: update(*args.values(), *a, wait_for=e)
 
 

From 9cf50b1d88f2745cbf8f8215e23ec0610de67b84 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Fri, 30 Nov 2018 00:36:32 -0800
Subject: [PATCH 28/37] Enable nonzero pml alpha

---
 opencl_fdtd/simulation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 087e9c4..d24dcf2 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -160,6 +160,7 @@ class Simulation(object):
             pml.setdefault('mu_eff', 1.0)
             pml.setdefault('ln_R_per_layer', -1.6)
             pml.setdefault('m', 3.5)
+            pml.setdefault('cfs_alpha', 0)
             pml.setdefault('ma', 1)
 
         ctype = type_to_C(self.arg_type)
@@ -278,7 +279,7 @@ class Simulation(object):
 
             sigma_max = -pml['ln_R_per_layer'] / 2 * (pml['m'] + 1)
             kappa_max = numpy.sqrt(pml['mu_eff'] * pml['epsilon_eff'])
-            alpha_max = 0           # TODO: Nonzero alpha?
+            alpha_max = pml['cfs_alpha']
 
             def par(x):
                 scaling = ((x / (pml['thickness'])) ** pml['m'])

From 60b70bb332d29fb5a2aecb801517be659cf4daf9 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Fri, 30 Nov 2018 00:37:07 -0800
Subject: [PATCH 29/37] Add restrict keyword to pointers (not sharing the same
 memory for multiple fields)

---
 opencl_fdtd/simulation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index d24dcf2..1e573c9 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -166,7 +166,7 @@ class Simulation(object):
         ctype = type_to_C(self.arg_type)
 
         def ptr(arg: str) -> str:
-            return ctype + ' *' + arg
+            return ctype + ' * restrict ' + arg
 
         base_fields = OrderedDict()
         base_fields[ptr('E')] = self.E

From a76e741d325d26ff5968904ae12f4ed95effd313 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Fri, 30 Nov 2018 00:37:16 -0800
Subject: [PATCH 30/37] Minor spacing changes

---
 opencl_fdtd/simulation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 1e573c9..988e45b 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -321,7 +321,6 @@ class Simulation(object):
                                    arguments=', '.join(args.keys()))
         return lambda e: update(*args.values(), wait_for=e)
 
-
     def _create_context(self, context: pyopencl.Context = None,
                         queue: pyopencl.CommandQueue = None):
         if context is None:
@@ -353,6 +352,7 @@ class Simulation(object):
                 Exception('Initial field list elements must have same shape as epsilon elements')
             return pyopencl.array.to_device(self.queue, vec(initial_value).astype(self.arg_type))
 
+
 def type_to_C(float_type) -> str:
     """
     Returns a string corresponding to the C equivalent of a numpy type.

From 0f0f16e1846489ac1ab4c4126167c5b0b3fbb0fe Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Fri, 30 Nov 2018 01:03:10 -0800
Subject: [PATCH 31/37] Minor formatting changes

---
 opencl_fdtd/kernels/update_h.cl | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/opencl_fdtd/kernels/update_h.cl b/opencl_fdtd/kernels/update_h.cl
index ac75157..187a18a 100644
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@@ -52,10 +52,7 @@ if ({{r}} == s{{r}} - 1) {
     dE{{u ~ r}} = bloch_re * dE{{u ~ r}} + bloch_im * (F{{u}}[i + p{{u}}] - F{{u}}[i]);
     dE{{v ~ r}} = bloch_re * dE{{v ~ r}} + bloch_im * (F{{v}}[i + p{{v}}] - F{{v}}[i]);
 }
-{%- endfor %}
-
-
-
+{% endfor -%}
 
 {%- if do_poynting %}
 
@@ -93,7 +90,7 @@ ftype pHzi = 0;
         {%- set se, sh = '-', '+' -%}
     {%- else -%}
         {%- set se, sh = '+', '-' -%}
-    {%- endif -%}
+    {%- endif %}
 
 int pml_{{r ~ p}}_thickness = {{pml['thickness']}};
 

From 385e2859a9620cefe3536bd2eea9f5a051c2e9b2 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Fri, 30 Nov 2018 01:03:20 -0800
Subject: [PATCH 32/37] Update parameter descriptions

---
 opencl_fdtd/simulation.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 988e45b..9dd90c0 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -100,9 +100,13 @@ class Simulation(object):
             'ln_R_per_layer': Desired (ln(R) / thickness) value. Default -1.6.
             'm': Polynomial grading exponent. Default 3.5.
             'ma': Exponent for alpha. Default 1.
+        :param bloch_boundaries: List of dicts with keys:
+            'axis': One of 'x', 'y', 'z'.
+            'real': Real part of bloch phase factor (i.e. real(exp(i * phase)))
+            'imag': Imaginary part of bloch phase factor (i.e. imag(exp(i * phase)))
         :param dt: Time step. Default is min(dxes) * .99/sqrt(3).
-        :param initial_E: Initial E-field (default is 0 everywhere). Same format as epsilon.
-        :param initial_H: Initial H-field (default is 0 everywhere). Same format as epsilon.
+        :param initial_fields: Dict with optional keys ('E', 'H', 'F', 'G') containing initial values for the
+            specified fields (default is 0 everywhere). Fields have same format as epsilon.
         :param context: pyOpenCL context. If not given, pyopencl.create_some_context(False) is called.
         :param queue: pyOpenCL command queue. If not given, pyopencl.CommandQueue(context) is called.
         :param float_type: numpy.float32 or numpy.float64. Default numpy.float32.

From 582dafbc2fefce0bcc38b524a715e99f9c5ac89b Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Fri, 30 Nov 2018 01:04:00 -0800
Subject: [PATCH 33/37] Update example with bloch fields

---
 fdtd.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fdtd.py b/fdtd.py
index 08606c6..99af07b 100644
--- a/fdtd.py
+++ b/fdtd.py
@@ -128,7 +128,9 @@ def main():
     # #### Create the simulation grid ####
     pmls = [{'axis': a, 'polarity': p, 'thickness': pml_thickness}
             for a in 'xyz' for p in 'np']
-    sim = Simulation(grid.grids, do_poynting=True, pmls=pmls)
+    #bloch = [{'axis': a, 'real': 1, 'imag': 0} for a in 'x']
+    bloch = []
+    sim = Simulation(grid.grids, do_poynting=True, pmls=pmls, bloch_boundaries=bloch)
 
     # Source parameters and function
     w = 2 * numpy.pi * dx / wl
@@ -142,22 +144,32 @@ def main():
 
     with open('sources.c', 'w') as f:
         f.write(sim.sources['E'])
-        f.write('\n==========================================\n')
+        f.write('\n====================H======================\n')
         f.write(sim.sources['H'])
         if sim.update_S:
-            f.write('\n==========================================\n')
+            f.write('\n=====================S=====================\n')
             f.write(sim.sources['S'])
+        if bloch:
+            f.write('\n=====================F=====================\n')
+            f.write(sim.sources['F'])
+            f.write('\n=====================G=====================\n')
+            f.write(sim.sources['G'])
 
     # #### Run a bunch of iterations ####
     # event = sim.whatever([prev_event]) indicates that sim.whatever should be queued
     #  immediately and run once prev_event is finished.
     start = time.perf_counter()
     for t in range(max_t):
-        sim.update_E([]).wait()
+        e = sim.update_E([])
+        if bloch:
+            e = sim.update_F([e])
+        e.wait()
 
         ind = numpy.ravel_multi_index(tuple(grid.shape//2), dims=grid.shape, order='C') + numpy.prod(grid.shape)
         sim.E[ind] += field_source(t)
         e = sim.update_H([])
+        if bloch:
+            e = sim.update_G([e])
         if sim.update_S:
             e = sim.update_S([e])
         e.wait()

From a37df3a74f10837c2c0a65cc8ac412cd7aba26bf Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sun, 14 Jul 2019 23:48:13 -0700
Subject: [PATCH 34/37] expand gitignores

---
 .gitignore | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3b93ce0..5298f42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,9 @@
 .idea/
-__pycache__
 *.h5
-*.pyc
+
+__pycache__
+*.py[cod]
+build/
+dist/
+*.egg-info/
 

From 314e36a3cc3b16fb3250a5a22b570ed84f4f13f7 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sun, 14 Jul 2019 23:50:11 -0700
Subject: [PATCH 35/37] fix for 1-sized grids

---
 opencl_fdtd/kernels/common.cl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/opencl_fdtd/kernels/common.cl b/opencl_fdtd/kernels/common.cl
index deecec9..8c58d48 100644
--- a/opencl_fdtd/kernels/common.cl
+++ b/opencl_fdtd/kernels/common.cl
@@ -78,7 +78,8 @@ int p{{r}} = + (int) di{{r}};
 int wrap_{{r}} = (s{{r}} - 1) * (int) di{{r}};
 if ( {{r}} == 0 ) {
   m{{r}} = wrap_{{r}};
-} else if ( {{r}} == s{{r}} - 1 ) {
+}
+if ( {{r}} == s{{r}} - 1 ) {
   p{{r}} = -wrap_{{r}};
-} 
+}
 {% endfor %}

From fe2f4f45109a21935056de044e381792fc950001 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Sun, 14 Jul 2019 23:51:57 -0700
Subject: [PATCH 36/37] style fixes

---
 opencl_fdtd/kernels/common.cl   | 2 +-
 opencl_fdtd/kernels/update_j.cl | 4 ++--
 opencl_fdtd/simulation.py       | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/opencl_fdtd/kernels/common.cl b/opencl_fdtd/kernels/common.cl
index 8c58d48..769c248 100644
--- a/opencl_fdtd/kernels/common.cl
+++ b/opencl_fdtd/kernels/common.cl
@@ -2,7 +2,7 @@
 /* Common code for E, H updates
  *
  * Template parameters:
- *  ftype	type name (e.g. float or double)
+ *  ftype       type name (e.g. float or double)
  *  shape       list of 3 ints specifying shape of fields
  */
 #}
diff --git a/opencl_fdtd/kernels/update_j.cl b/opencl_fdtd/kernels/update_j.cl
index 4d7f64d..2682f39 100644
--- a/opencl_fdtd/kernels/update_j.cl
+++ b/opencl_fdtd/kernels/update_j.cl
@@ -1,11 +1,11 @@
 /*
- *  Update E-field from J field 
+ *  Update E-field from J field
  *
  *  Template parameters:
  *   common_header: Rendered contents of common.cl
  *
  *  OpenCL args:
- *   E, Jr, Ji, c, s, xmin, xmax, ymin, ymax, zmin, zmax   
+ *   E, Jr, Ji, c, s, xmin, xmax, ymin, ymax, zmin, zmax
  */
 
 {{common_header}}
diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 9dd90c0..03b5588 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -286,13 +286,13 @@ class Simulation(object):
             alpha_max = pml['cfs_alpha']
 
             def par(x):
-                scaling = ((x / (pml['thickness'])) ** pml['m'])
+                scaling = (x / pml['thickness']) ** pml['m']
                 sigma = scaling * sigma_max
                 kappa = 1 + scaling * (kappa_max - 1)
                 alpha = ((1 - x / pml['thickness']) ** pml['ma']) * alpha_max
                 p0 = numpy.exp(-(sigma / kappa + alpha) * self.dt)
                 p1 = sigma / (sigma + kappa * alpha) * (p0 - 1)
-                p2 = 1/kappa
+                p2 = 1 / kappa
                 return p0, p1, p2
 
             xe, xh = (numpy.arange(1, pml['thickness'] + 1, dtype=self.arg_type)[::-1] for _ in range(2))

From d5fd78d4933cb65f04d85eed7fca5e3264c934f8 Mon Sep 17 00:00:00 2001
From: Jan Petykiewicz <anewusername@gmail.com>
Date: Wed, 17 Jul 2019 00:53:55 -0700
Subject: [PATCH 37/37] Revert "Add restrict keyword to pointers (not sharing
 the same memory for multiple fields)"

This reverts commit 60b70bb332d29fb5a2aecb801517be659cf4daf9.

It appears to have minimal performance impact, and fails to compile on
nvidia cards.
---
 opencl_fdtd/simulation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencl_fdtd/simulation.py b/opencl_fdtd/simulation.py
index 03b5588..c5d0005 100644
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@@ -170,7 +170,7 @@ class Simulation(object):
         ctype = type_to_C(self.arg_type)
 
         def ptr(arg: str) -> str:
-            return ctype + ' * restrict ' + arg
+            return ctype + ' *' + arg
 
         base_fields = OrderedDict()
         base_fields[ptr('E')] = self.E