use arrays for pml params

2016-09-01 14:39:44 -07:00
15 changed files with 508 additions and 906 deletions
--- a/README.md
+++ b/README.md
@ -5,38 +5,32 @@ electromagnetic simulations on parallel compute hardware (mainly GPUs).
 **Performance** highly depends on what hardware you have available:
 * A 395x345x73 cell simulation (~10 million points, 8-cell absorbing boundaries)
- runs at around 91 iterations/sec. on my AMD RX480.
+ runs at around 42 iterations/sec. on my Nvidia GTX 580.
-* On an Nvidia GTX 580, it runs at 66 iterations/sec
+* On my laptop (Nvidia 940M) the same simulation achieves ~8 iterations/sec.
 * On my laptop (Nvidia 940M) the same simulation achieves ~12 iterations/sec.
 * An L3 photonic crystal cavity ringdown simulation (1550nm source, 40nm
- discretization, 8000 steps) takes about 3 minutes on my laptop.
+ discretization, 8000 steps) takes about 5 minutes on my laptop.
 **Capabilities** are currently pretty minimal:
 * Absorbing boundaries (CPML)
-* Perfect electrical conductors (PECs; to use set epsilon to inf)
+* Conducting boundaries (PMC)
 * Anisotropic media (eps_xx, eps_yy, eps_zz, mu_xx, ...)
 * Direct access to fields (eg., you can trivially add a soft or hard
- current source with just sim.E[ind] += sin(f0 * t), or save any portion
+ current source with just sim.E[1] += sin(f0 * t), or save any portion
 of a field to a file)
 ## Installation
 **Requirements:**
 * python 3 (written and tested with 3.5)
 * numpy
 * pyopencl
-* jinja2
+* h5py (for file output)
-* [fdfd_tools](https://mpxd.net/code/jan/fdfd_tools)
+* [gridlock](https://mpxd.net/gogs/jan/gridlock)
-
+* [masque](https://mpxd.net/gogs/jan/masque)
 Optional (used for examples):
 * dill (for file output)
 * [gridlock](https://mpxd.net/code/jan/gridlock)
 * [masque](https://mpxd.net/code/jan/masque)
 To get the code, just clone this repository:
 ```bash
-git clone https://mpxd.net/code/jan/opencl_fdtd.git
+git clone https://mpxd.net/gogs/jan/opencl_fdtd.git
 ```
 You can install the requirements and their dependencies easily with
--- a/fdtd.py
+++ b/fdtd.py
@ -6,23 +6,14 @@ See main() for simulation setup.
 import sys
 import time
 import logging
 import numpy
-import lzma
+import h5py
 import dill
-from opencl_fdtd import Simulation
+from fdtd.simulation import Simulation
 from masque import Pattern, shapes
 import gridlock
 import pcgen
 import fdfd_tools
 __author__ = 'Jan Petykiewicz'
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 def perturbed_l3(a: float, radius: float, **kwargs) -> Pattern:
@ -84,7 +75,7 @@ def perturbed_l3(a: float, radius: float, **kwargs) -> Pattern:
 def main():
    max_t = 8000            # number of timesteps
-    dx = 25                 # discretization (nm/cell)
+    dx = 40                 # discretization (nm/cell)
    pml_thickness = 8       # (number of cells)
    wl = 1550               # Excitation wavelength and fwhm
@ -105,8 +96,7 @@ def main():
    z_max = 1.6 * a
    xyz_max = numpy.hstack((xy_max, z_max)) + pml_thickness * dx
-    # Coordinates of the edges of the cells.
+    # Coordinates of the edges of the cells. The fdtd package can only do square grids at the moment.
    #  The fdtd package can only do square grids at the moment.
    half_edge_coords = [numpy.arange(dx/2, m + dx, step=dx) for m in xyz_max]
    edge_coords = [numpy.hstack((-h[::-1], h)) for h in half_edge_coords]
@ -124,13 +114,19 @@ def main():
                       eps=n_air**2,
                       polygons=mask.as_polygons())
-    logger.info('grid shape: {}'.format(grid.shape))
+    print(grid.shape)
    # #### Create the simulation grid ####
-    pmls = [{'axis': a, 'polarity': p, 'thickness': pml_thickness}
+    sim = Simulation(grid.grids)
-            for a in 'xyz' for p in 'np']
+
-    #bloch = [{'axis': a, 'real': 1, 'imag': 0} for a in 'x']
+    # Conducting boundaries and pmls in every direction
-    bloch = []
+    c_args = []
-    sim = Simulation(grid.grids, do_poynting=True, pmls=pmls, bloch_boundaries=bloch)
+    pml_args = []
    for d in (0, 1, 2):
        for p in (-1, 1):
            c_args += [{'direction': d, 'polarity': p}]
            pml_args += [{'direction': d, 'polarity': p, 'epsilon_eff': n_slab**2}]
    sim.init_conductors(c_args)
    sim.init_cpml(pml_args)
    # Source parameters and function
    w = 2 * numpy.pi * dx / wl
@ -142,54 +138,34 @@ def main():
        t0 = i * sim.dt - delay
        return numpy.sin(w * t0) * numpy.exp(-alpha * t0**2)
    with open('sources.c', 'w') as f:
        f.write(sim.sources['E'])
        f.write('\n====================H======================\n')
        f.write(sim.sources['H'])
        if sim.update_S:
            f.write('\n=====================S=====================\n')
            f.write(sim.sources['S'])
        if bloch:
            f.write('\n=====================F=====================\n')
            f.write(sim.sources['F'])
            f.write('\n=====================G=====================\n')
            f.write(sim.sources['G'])
    # #### Run a bunch of iterations ####
-    # event = sim.whatever([prev_event]) indicates that sim.whatever should be queued
+    # event = sim.whatever([prev_event]) indicates that sim.whatever should be queued immediately and run
-    #  immediately and run once prev_event is finished.
+    #  once prev_event is finished.
    output_file = h5py.File('simulation_output.h5', 'w')
    start = time.perf_counter()
    for t in range(max_t):
-        e = sim.update_E([])
+        event = sim.cpml_E([])
-        if bloch:
+        sim.update_E([event]).wait()
            e = sim.update_F([e])
        e.wait()
-        ind = numpy.ravel_multi_index(tuple(grid.shape//2), dims=grid.shape, order='C') + numpy.prod(grid.shape)
+        sim.E[1][tuple(grid.shape//2)] += field_source(t)
-        sim.E[ind] += field_source(t)
+        event = sim.conductor_E([])
-        e = sim.update_H([])
+        event = sim.cpml_H([event])
-        if bloch:
+        event = sim.update_H([event])
-            e = sim.update_G([e])
+        sim.conductor_H([event]).wait()
        if sim.update_S:
            e = sim.update_S([e])
        e.wait()
-        if t % 100 == 0:
+        print('iteration {}: average {} iterations per sec'.format(t, (t+1)/(time.perf_counter()-start)))
-            logger.info('iteration {}: average {} iterations per sec'.format(t, (t+1)/(time.perf_counter()-start)))
+        sys.stdout.flush()
            sys.stdout.flush()
-    with lzma.open('saved_simulation', 'wb') as f:
+        if t % 20 == 0:
-        def unvec(f):
+            r = sum([(f * f * e).get().sum() for f, e in zip(sim.E, sim.eps)])
-            return fdfd_tools.unvec(f, grid.shape)
+            print('E sum', r)
        d = {
            'grid': grid,
            'E': unvec(sim.E.get()),
            'H': unvec(sim.H.get()),
            }
        if sim.S is not None:
            d['S'] = unvec(sim.S.get())
        dill.dump(d, f)
        # Save field slices
        if (t % 20 == 0 and (max_t - t <= 1000 or t <= 2000)) or t == max_t-1:
            print('saving E-field')
            for j, f in enumerate(sim.E):
                a = f.get()
                output_file['/E{}_t{}'.format('xyz'[j], t)] = a[:, :, round(a.shape[2]/2)]
 if __name__ == '__main__':
    main()
--- a/fdtd/init.py
+++ b/fdtd/init.py
--- a/fdtd/base.py
+++ b/fdtd/base.py
@ -0,0 +1,72 @@
 """
 Basic code snippets for opencl FDTD
 """
 from typing import List
 import numpy
 def shape_source(shape: List[int] or numpy.ndarray) -> str:
    """
    Defines sx, sy, sz C constants specifying the shape of the grid in each of the 3 dimensions.
    :param shape: [sx, sy, sz] values.
    :return: String containing C source.
    """
    sxyz = """
 // Field sizes
 const int sx = {shape[0]};
 const int sy = {shape[1]};
 const int sz = {shape[2]};
 """.format(shape=shape)
    return sxyz
 # Defines dix, diy, diz constants used for stepping in the x, y, z directions in a linear array
 #  (ie, given Ex[i] referring to position (x, y, z), Ex[i+diy] will refer to position (x, y+1, z))
 dixyz_source = """
 // Convert offset in field xyz to linear index offset
 const int dix = sz * sy;
 const int diy = sz;
 const int diz = 1;
 """
 # Given a linear index i and shape sx, sy, sz, defines x, y, and z as the 3D indices of the current element (i).
 xyz_source = """
 // Convert linear index to field index (xyz)
 const int x = i / (sz * sy);
 const int y = (i - x * sz * sy) / sz;
 const int z = (i - y * sz - x * sz * sy);
 """
 # Source code for updating the E field; maxes use of dixyz_source.
 maxwell_E_source = """
 // E update equations
 Ex[i] += dt / epsx[i] * ((Hz[i] - Hz[i-diy]) - (Hy[i] - Hy[i-diz]));
 Ey[i] += dt / epsy[i] * ((Hx[i] - Hx[i-diz]) - (Hz[i] - Hz[i-dix]));
 Ez[i] += dt / epsz[i] * ((Hy[i] - Hy[i-dix]) - (Hx[i] - Hx[i-diy]));
 """
 # Source code for updating the H field; maxes use of dixyz_source and assumes mu=0
 maxwell_H_source = """
 // H update equations
 Hx[i] -= dt * ((Ez[i+diy] - Ez[i]) - (Ey[i+diz] - Ey[i]));
 Hy[i] -= dt * ((Ex[i+diz] - Ex[i]) - (Ez[i+dix] - Ez[i]));
 Hz[i] -= dt * ((Ey[i+dix] - Ey[i]) - (Ex[i+diy] - Ex[i]));
 """
 def type_to_C(float_type: numpy.float32 or numpy.float64) -> str:
    """
    Returns a string corresponding to the C equivalent of a numpy type.
    Only works for float32 and float64.
    :param float_type: numpy.float32 or numpy.float64
    :return: string containing the corresponding C type (eg. 'double')
    """
    if float_type == numpy.float32:
        arg_type = 'float'
    elif float_type == numpy.float64:
        arg_type = 'double'
    else:
        raise Exception('Unsupported type')
    return arg_type
--- a/fdtd/boundary.py
+++ b/fdtd/boundary.py
@ -0,0 +1,173 @@
 from typing import List, Dict
 import numpy
 def conductor(direction: int,
              polarity: int,
              ) -> List[str]:
    """
    Create source code for conducting boundary conditions.
    :param direction: integer in range(3), corresponding to x,y,z.
    :param polarity: -1 or 1, specifying eg. a -x or +x boundary.
    :return: [E_source, H_source] source code for E and H boundary update steps.
    """
    if direction not in range(3):
        raise Exception('Invalid direction: {}'.format(direction))
    if polarity not in (-1, 1):
        raise Exception('Invalid polarity: {}'.format(polarity))
    r = 'xyz'[direction]
    uv = 'xyz'.replace(r, '')
    if polarity < 0:
        bc_e = """
 if ({r} == 0) {{
    E{r}[i] = 0;
    E{u}[i] = E{u}[i+di{r}];
    E{v}[i] = E{v}[i+di{r}];
 }}
 """
        bc_h = """
 if ({r} == 0) {{
    H{r}[i] = H{r}[i+di{r}];
    H{u}[i] = 0;
    H{v}[i] = 0;
 }}
 """
    elif polarity > 0:
        bc_e = """
 if ({r} == s{r} - 1) {{
    E{r}[i] = -E{r}[i-2*di{r}];
    E{u}[i] = +E{u}[i-di{r}];
    E{v}[i] = +E{v}[i-di{r}];
 }} else if ({r} == s{r} - 2) {{
    E{r}[i] = 0;
 }}
 """
        bc_h = """
 if ({r} == s{r} - 1) {{
    H{r}[i] = +H{r}[i-di{r}];
    H{u}[i] = -H{u}[i-2*di{r}];
    H{v}[i] = -H{v}[i-2*di{r}];
 }} else if ({r} == s{r} - 2) {{
    H{u}[i] = 0;
    H{v}[i] = 0;
 }}
 """
    else:
        raise Exception()
    replacements = {'r': r, 'u': uv[0], 'v': uv[1]}
    return [s.format(**replacements) for s in (bc_e, bc_h)]
 def cpml(direction: int,
         polarity: int,
         dt: float,
         thickness: int=8,
         epsilon_eff: float=1,
         ) -> Dict:
    """
    Generate source code for complex phase matched layer (cpml) absorbing boundaries.
    These are not full boundary conditions and require a conducting boundary to be added
     in the same direction as the pml.
    :param direction: integer in range(3), corresponding to x, y, z directions.
    :param polarity: -1 or 1, corresponding to eg. -x or +x direction.
    :param dt: timestep used by the simulation
    :param thickness: Number of cells used by the pml (the grid is NOT expanded to add these cells). Default 8.
    :param epsilon_eff: Effective epsilon_r of the pml layer. Default 1.
    :return: Dict with entries 'E', 'H' (update equations for E and H) and 'psi_E', 'psi_H' (lists of str,
            specifying the field names of the cpml fields used in the E and H update steps. Eg.,
            Psi_xn_Ex for the complex Ex component for the x- pml.)
    """
    if direction not in range(3):
        raise Exception('Invalid direction: {}'.format(direction))
    if polarity not in (-1, 1):
        raise Exception('Invalid polarity: {}'.format(polarity))
    if thickness <= 2:
        raise Exception('It would be wise to have a pml with 4+ cells of thickness')
    if epsilon_eff <= 0:
        raise Exception('epsilon_eff must be positive')
    m = (3.5, 1)
    sigma_max = 0.8 * (m[0] + 1) / numpy.sqrt(epsilon_eff)
    alpha_max = 0  # TODO: Decide what to do about non-zero alpha
    transverse = numpy.delete(range(3), direction)
    r = 'xyz'[direction]
    np = 'nVp'[numpy.sign(polarity)+1]
    uv = ['xyz'[i] for i in transverse]
    xe = numpy.arange(1, thickness+1, dtype=float)[::-1]
    xh = numpy.arange(1, thickness+1, dtype=float)[::-1]
    if polarity > 0:
        xe -= 0.5
    elif polarity < 0:
        xh -= 0.5
    def par(x):
        sigma = ((x / thickness) ** m[0]) * sigma_max
        alpha = ((1 - x / thickness) ** m[1]) * alpha_max
        p0 = numpy.exp(-(sigma + alpha) * dt)
        p1 = sigma / (sigma + alpha) * (p0 - 1)
        return p0, p1
    vals = {'r': r,
            'u': uv[0],
            'v': uv[1],
            'np':   np,
            'th':   thickness,
            'se': '-+'[direction % 2],
            'sh': '+-'[direction % 2]}
    if polarity < 0:
        bounds_if = """
 if ( 0 < {r} && {r} < {th} + 1 ) {{
    const int ir = {r} - 1;                             // index into pml parameters
    const int ip = {v} + {u} * s{v} + ir * s{v} * s{u};   // linear index into Psi
 """
    elif polarity > 0:
        bounds_if = """
 if ( (s{r} - 1) > {r} && {r} > (s{r} - 1) - ({th} + 1) ) {{
    const int ir = (s{r} - 1) - ({r} + 1);              // index into pml parameters
    const int ip = {v} + {u} * s{v} + ir * s{v} * s{u};   // linear index into Psi
 """
    else:
        raise Exception('Bad polarity (=0)')
    code_e = """
    Psi_{r}{np}_E{u}[ip] = p0e[ir] * Psi_{r}{np}_E{u}[ip] + p1e[ir] * (H{v}[i] - H{v}[i-di{r}]);
    Psi_{r}{np}_E{v}[ip] = p0e[ir] * Psi_{r}{np}_E{v}[ip] + p1e[ir] * (H{u}[i] - H{u}[i-di{r}]);
    E{u}[i] {se}= dt / eps{u}[i] * Psi_{r}{np}_E{u}[ip];
    E{v}[i] {sh}= dt / eps{v}[i] * Psi_{r}{np}_E{v}[ip];
 }}
 """
    code_h = """
    Psi_{r}{np}_H{u}[ip] = p0h[ir] * Psi_{r}{np}_H{u}[ip] + p1h[ir] * (E{v}[i+di{r}] - E{v}[i]);
    Psi_{r}{np}_H{v}[ip] = p0h[ir] * Psi_{r}{np}_H{v}[ip] + p1h[ir] * (E{u}[i+di{r}] - E{u}[i]);
    H{u}[i] {sh}= dt * Psi_{r}{np}_H{u}[ip];
    H{v}[i] {se}= dt * Psi_{r}{np}_H{v}[ip];
 }}
 """
    pml_data = {
        'E': (bounds_if + code_e).format(**vals),
        'H': (bounds_if + code_h).format(**vals),
        'psi_E': ['Psi_{r}{np}_E{u}'.format(**vals),
                  'Psi_{r}{np}_E{v}'.format(**vals)],
        'psi_H': ['Psi_{r}{np}_H{u}'.format(**vals),
                  'Psi_{r}{np}_H{v}'.format(**vals)],
        'pe': par(xe),
        'ph': par(xh),
    }
    return pml_data
--- a/fdtd/simulation.py
+++ b/fdtd/simulation.py
@ -0,0 +1,214 @@
 """
 Class for constructing and holding the basic FDTD operations and fields
 """
 from typing import List, Dict, Callable
 import numpy
 import warnings
 import pyopencl
 import pyopencl.array
 from pyopencl.elementwise import ElementwiseKernel
 from . import boundary, base
 from .base import type_to_C
 class Simulation(object):
    """
    Constructs and holds the basic FDTD operations and related fields
    """
    E = None    # type: List[pyopencl.array.Array]
    H = None    # type: List[pyopencl.array.Array]
    eps = None  # type: List[pyopencl.array.Array]
    dt = None   # type: float
    arg_type = None     # type: numpy.float32 or numpy.float64
    context = None      # type: pyopencl.Context
    queue = None        # type: pyopencl.CommandQueue
    update_E = None     # type: Callable[[],pyopencl.Event]
    update_H = None     # type: Callable[[],pyopencl.Event]
    conductor_E = None  # type: Callable[[],pyopencl.Event]
    conductor_H = None  # type: Callable[[],pyopencl.Event]
    cpml_E = None       # type: Callable[[],pyopencl.Event]
    cpml_H = None       # type: Callable[[],pyopencl.Event]
    cpml_psi_E = None   # type: Dict[str, pyopencl.array.Array]
    cpml_psi_H = None   # type: Dict[str, pyopencl.array.Array]
    def __init__(self,
                 epsilon: List[numpy.ndarray],
                 dt: float=.99/numpy.sqrt(3),
                 initial_E: List[numpy.ndarray]=None,
                 initial_H: List[numpy.ndarray]=None,
                 context: pyopencl.Context=None,
                 queue: pyopencl.CommandQueue=None,
                 float_type: numpy.float32 or numpy.float64=numpy.float32):
        """
        Initialize the simulation.
        :param epsilon: List containing [eps_r,xx, eps_r,yy, eps_r,zz], where each element is a Yee-shifted ndarray
                spanning the simulation domain. Relative epsilon is used.
        :param dt: Time step. Default is the Courant factor.
        :param initial_E: Initial E-field (default is 0 everywhere). Same format as epsilon.
        :param initial_H: Initial H-field (default is 0 everywhere). Same format as epsilon.
        :param context: pyOpenCL context. If not given, pyopencl.create_some_context(False) is called.
        :param queue: pyOpenCL command queue. If not given, pyopencl.CommandQueue(context) is called.
        :param float_type: numpy.float32 or numpy.float64. Default numpy.float32.
        """
        if len(epsilon) != 3:
            Exception('Epsilon must be a list with length of 3')
        if not all((e.shape == epsilon[0].shape for e in epsilon[1:])):
            Exception('All epsilon grids must have the same shape. Shapes are {}', [e.shape for e in epsilon])
        if context is None:
            self.context = pyopencl.create_some_context(False)
        else:
            self.context = context
        if queue is None:
            self.queue = pyopencl.CommandQueue(self.context)
        else:
            self.queue = queue
        if dt > .99/numpy.sqrt(3):
            warnings.warn('Warning: unstable dt: {}'.format(dt))
        elif dt <= 0:
            raise Exception('Invalid dt: {}'.format(dt))
        else:
            self.dt = dt
        self.arg_type = float_type
        self.eps = [pyopencl.array.to_device(self.queue, e.astype(float_type)) for e in epsilon]
        if initial_E is None:
            self.E = [pyopencl.array.zeros_like(self.eps[0]) for _ in range(3)]
        else:
            if len(initial_E) != 3:
                Exception('Initial_E must be a list of length 3')
            if not all((E.shape == epsilon[0].shape for E in initial_E)):
                Exception('Initial_E list elements must have same shape as epsilon elements')
            self.E = [pyopencl.array.to_device(self.queue, E.astype(float_type)) for E in initial_E]
        if initial_H is None:
            self.H = [pyopencl.array.zeros_like(self.eps[0]) for _ in range(3)]
        else:
            if len(initial_H) != 3:
                Exception('Initial_H must be a list of length 3')
            if not all((H.shape == epsilon[0].shape for H in initial_H)):
                Exception('Initial_H list elements must have same shape as epsilon elements')
            self.H = [pyopencl.array.to_device(self.queue, H.astype(float_type)) for H in initial_H]
        ctype = type_to_C(self.arg_type)
        E_args = [ctype + ' *E' + c for c in 'xyz']
        H_args = [ctype + ' *H' + c for c in 'xyz']
        eps_args = [ctype + ' *eps' + c for c in 'xyz']
        dt_arg = [ctype + ' dt']
        sxyz = base.shape_source(epsilon[0].shape)
        E_source = sxyz + base.dixyz_source + base.maxwell_E_source
        H_source = sxyz + base.dixyz_source + base.maxwell_H_source
        E_update = ElementwiseKernel(self.context, operation=E_source,
                                     arguments=', '.join(E_args + H_args + dt_arg + eps_args))
        H_update = ElementwiseKernel(self.context, operation=H_source,
                                     arguments=', '.join(E_args + H_args + dt_arg))
        self.update_E = lambda e: E_update(*self.E, *self.H, self.dt, *self.eps, wait_for=e)
        self.update_H = lambda e: H_update(*self.E, *self.H, self.dt, wait_for=e)
    def init_cpml(self, pml_args: List[Dict]):
        """
        Initialize absorbing layers (cpml: complex phase matched layer). PMLs are not actual
         boundary conditions, so you should add a conducting boundary (.init_conductors()) for
         all directions in which you add PMLs.
        Allows use of self.cpml_E(events) and self.cpml_H(events).
        All necessary additional fields are created on the opencl device.
        :param pml_args: A list containing dictionaries which are passed to .boundary.cpml(...).
            The dt argument is set automatically, but the others must be passed in each entry
             of pml_args.
        """
        sxyz = base.shape_source(self.eps[0].shape)
        # Prepare per-iteration constants for later use
        pml_E_source = sxyz + base.dixyz_source + base.xyz_source
        pml_H_source = sxyz + base.dixyz_source + base.xyz_source
        psi_E = []
        psi_H = []
        psi_E_names = []
        psi_H_names = []
        for arg_set in pml_args:
            pml_data = boundary.cpml(dt=self.dt, **arg_set)
            pml_E_source += pml_data['E']
            pml_H_source += pml_data['H']
            ti = numpy.delete(range(3), arg_set['direction'])
            trans = [self.eps[0].shape[i] for i in ti]
            psi_shape = (8, trans[0], trans[1])
            psi_E += [pyopencl.array.zeros(self.queue, psi_shape, dtype=self.arg_type)
                      for _ in pml_data['psi_E']]
            psi_H += [pyopencl.array.zeros(self.queue, psi_shape, dtype=self.arg_type)
                      for _ in pml_data['psi_H']]
            psi_E_names += pml_data['psi_E']
            psi_H_names += pml_data['psi_H']
        ctype = type_to_C(self.arg_type)
        E_args = [ctype + ' *E' + c for c in 'xyz']
        H_args = [ctype + ' *H' + c for c in 'xyz']
        eps_args = [ctype + ' *eps' + c for c in 'xyz']
        dt_arg = [ctype + ' dt']
        arglist_E = [ctype + ' *' + psi for psi in psi_E_names]
        arglist_H = [ctype + ' *' + psi for psi in psi_H_names]
        pe_args = [ctype + ' *' + s for s in ('p0e', 'p1e')]
        ph_args = [ctype + ' *' + s for s in ('p0h', 'p1h')]
        pml_E_args = ', '.join(E_args + H_args + dt_arg + eps_args + arglist_E + pe_args)
        pml_H_args = ', '.join(E_args + H_args + dt_arg + arglist_H + ph_args)
        pml_E = ElementwiseKernel(self.context, arguments=pml_E_args, operation=pml_E_source)
        pml_H = ElementwiseKernel(self.context, arguments=pml_H_args, operation=pml_H_source)
        pe = [pyopencl.array.to_device(self.queue, p) for p in pml_data['pe']]
        ph = [pyopencl.array.to_device(self.queue, p) for p in pml_data['ph']]
        self.cpml_E = lambda e: pml_E(*self.E, *self.H, self.dt, *self.eps, *psi_E, *pe, wait_for=e)
        self.cpml_H = lambda e: pml_H(*self.E, *self.H, self.dt, *psi_H, *ph, wait_for=e)
        self.cmpl_psi_E = {k: v for k, v in zip(psi_E_names, psi_E)}
        self.cmpl_psi_H = {k: v for k, v in zip(psi_H_names, psi_H)}
    def init_conductors(self, conductor_args: List[Dict]):
        """
        Initialize reflecting boundary conditions.
        Allows use of self.conductor_E(events) and self.conductor_H(events).
        :param conductor_args: List of dictionaries with which to call .boundary.conductor(...).
        """
        sxyz = base.shape_source(self.eps[0].shape)
        # Prepare per-iteration constants
        bc_E_source = sxyz + base.dixyz_source + base.xyz_source
        bc_H_source = sxyz + base.dixyz_source + base.xyz_source
        for arg_set in conductor_args:
            [e, h] = boundary.conductor(**arg_set)
            bc_E_source += e
            bc_H_source += h
        E_args = [type_to_C(self.arg_type) + ' *E' + c for c in 'xyz']
        H_args = [type_to_C(self.arg_type) + ' *H' + c for c in 'xyz']
        bc_E = ElementwiseKernel(self.context, arguments=E_args, operation=bc_E_source)
        bc_H = ElementwiseKernel(self.context, arguments=H_args, operation=bc_H_source)
        self.conductor_E = lambda e: bc_E(*self.E, wait_for=e)
        self.conductor_H = lambda e: bc_H(*self.H, wait_for=e)
--- a/opencl_fdtd/init.py
+++ b/opencl_fdtd/init.py
@ -1,5 +0,0 @@
 from .simulation import Simulation, type_to_C
 __author__ = 'Jan Petykiewicz'
 version = '0.4'
--- a/opencl_fdtd/kernels/common.cl
+++ b/opencl_fdtd/kernels/common.cl
@ -1,84 +0,0 @@
 {#
 /* Common code for E, H updates
 *
 * Template parameters:
 *  ftype	type name (e.g. float or double)
 *  shape       list of 3 ints specifying shape of fields
 */
 #}
 typedef {{ftype}} ftype;
 /*
 * Field size info
 */
 const size_t sx = {{shape[0]}};
 const size_t sy = {{shape[1]}};
 const size_t sz = {{shape[2]}};
 const size_t field_size = sx * sy * sz;
 //Since we use i to index into Ex[], Ey[], ... rather than E[], do nothing if
 // i is outside the bounds of Ex[].
 if (i >= field_size) {
    PYOPENCL_ELWISE_CONTINUE;
 }
 /*
 * Array indexing
 */
 // Given a linear index i and shape (sx, sy, sz), defines x, y, and z
 //  as the 3D indices of the current element (i).
 // (ie, converts linear index [i] to field indices (x, y, z)
 const size_t x = i / (sz * sy);
 const size_t y = (i - x * sz * sy) / sz;
 const size_t z = (i - y * sz - x * sz * sy);
 // Calculate linear index offsets corresponding to offsets in 3D
 // (ie, if E[i] <-> E(x, y, z), then E[i + diy] <-> E(x, y + 1, z)
 const size_t dix = sz * sy;
 const size_t diy = sz;
 const size_t diz = 1;
 /*
 * Pointer math
 */
 //Pointer offsets into the components of a linearized vector-field
 // (eg. Hx = H + XX, where H and Hx are pointers)
 const size_t XX = 0;
 const size_t YY = field_size;
 const size_t ZZ = field_size * 2;
 //Define pointers to vector components of each field (eg. Hx = H + XX)
 __global ftype *Ex = E + XX;
 __global ftype *Ey = E + YY;
 __global ftype *Ez = E + ZZ;
 __global ftype *Hx = H + XX;
 __global ftype *Hy = H + YY;
 __global ftype *Hz = H + ZZ;
 /*
 * Implement periodic boundary conditions
 *
 * mx ([m]inus [x]) gives the index offset of the adjacent cell in the minus-x direction.
 * In the event that we start at x == 0, we actually want to wrap around and grab the cell
 * x_{-1} == (sx - 1) instead, ie. mx = (sx - 1) * dix .
 *
 * px ([p]lus [x]) gives the index offset of the adjacent cell in the plus-x direction.
 * In the event that we start at x == (sx - 1), we actually want to wrap around and grab
 * the cell x_{+1} == 0 instead, ie. px = -(sx - 1) * dix .
 */
 {% for r in 'xyz' %}
 int m{{r}} = - (int) di{{r}};
 int p{{r}} = + (int) di{{r}};
 int wrap_{{r}} = (s{{r}} - 1) * (int) di{{r}};
 if ( {{r}} == 0 ) {
  m{{r}} = wrap_{{r}};
 } else if ( {{r}} == s{{r}} - 1 ) {
  p{{r}} = -wrap_{{r}};
 } 
 {% endfor %}
--- a/opencl_fdtd/kernels/update_e.cl
+++ b/opencl_fdtd/kernels/update_e.cl
@ -1,107 +0,0 @@
 /*
 *  Update E-field, including any PMLs.
 *
 *  Template parameters:
 *   common_header: Rendered contents of common.cl
 *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
 *      axes, polarities, and thicknesses.
 *   uniform_dx: If grid is uniform, uniform_dx should be the grid spacing.
 *      Otherwise, uniform_dx should be False and [inv_dh{xyz}] arrays must be supplied as
 *      OpenCL args.
 *
 *  OpenCL args:
 *   E, H, dt, eps, [p{012}e{np}, Psi_{xyz}{np}_E], [inv_dh{xyz}]
 */
 {{common_header}}
 ////////////////////////////////////////////////////////////////////////////
 __global ftype *epsx = eps + XX;
 __global ftype *epsy = eps + YY;
 __global ftype *epsz = eps + ZZ;
 {%- if uniform_dx %}
 ftype inv_dx = 1.0 / {{uniform_dx}};
 ftype inv_dy = 1.0 / {{uniform_dx}};
 ftype inv_dz = 1.0 / {{uniform_dx}};
 {%- else %}
 ftype inv_dx = inv_dhx[x];
 ftype inv_dy = inv_dhy[y];
 ftype inv_dz = inv_dhz[z];
 {%- endif %}
 /*
 *   Precalculate derivatives
 */
 ftype dHxy = (Hx[i] - Hx[i + my]) * inv_dy;
 ftype dHxz = (Hx[i] - Hx[i + mz]) * inv_dz;
 ftype dHyx = (Hy[i] - Hy[i + mx]) * inv_dx;
 ftype dHyz = (Hy[i] - Hy[i + mz]) * inv_dz;
 ftype dHzx = (Hz[i] - Hz[i + mx]) * inv_dx;
 ftype dHzy = (Hz[i] - Hz[i + my]) * inv_dy;
 {% for bloch in bloch_boundaries -%}
    {%- set r = bloch['axis'] -%}
    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
 if ({{r}} == 0) {
    ftype bloch_im = {{bloch['real']}};
    ftype bloch_re = {{bloch['imag']}};
    dH{{u ~ r}} = bloch_re * dH{{v ~ r}} + bloch_im * (G{{u}}[i] - G{{u}}[i + m{{u}}]);
    dH{{v ~ r}} = bloch_re * dH{{v ~ r}} + bloch_im * (G{{v}}[i] - G{{v}}[i + m{{v}}]);
 }
 {%- endfor %}
 /*
 *   PML Update
 */
 // PML effects on E
 ftype pExi = 0;
 ftype pEyi = 0;
 ftype pEzi = 0;
 {% for pml in pmls -%}
    {%- set r = pml['axis'] -%}
    {%- set p = pml['polarity'] -%}
    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
    {%- set psi = 'Psi_' ~ r ~ p ~ '_E' -%}
    {%- if r != 'y' -%}
        {%- set se, sh = '-', '+' -%}
    {%- else -%}
        {%- set se, sh = '+', '-' -%}
    {%- endif -%}
 int pml_{{r ~ p}}_thickness = {{pml['thickness']}};
    {%- if p == 'n' %}
 if ( {{r}} < pml_{{r ~ p}}_thickness ) {
    const size_t ir = {{r}};                          // index into pml parameters
    {%- elif p == 'p' %}
 if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
    const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
    {%- endif %}
    const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
    dH{{v ~ r}} *= p{{r}}2e{{p}}[ir];
    dH{{u ~ r}} *= p{{r}}2e{{p}}[ir];
    {{psi ~ u}}[ip] = p{{r}}0e{{p}}[ir] * {{psi ~ u}}[ip] + p{{r}}1e{{p}}[ir] * dH{{v ~ r}};
    {{psi ~ v}}[ip] = p{{r}}0e{{p}}[ir] * {{psi ~ v}}[ip] + p{{r}}1e{{p}}[ir] * dH{{u ~ r}};
    pE{{u}}i {{se}}= {{psi ~ u}}[ip];
    pE{{v}}i {{sh}}= {{psi ~ v}}[ip];
 }
 {%- endfor %}
 /*
 *  Update E
 */
 Ex[i] += dt / epsx[i] * (dHzy - dHyz + pExi);
 Ey[i] += dt / epsy[i] * (dHxz - dHzx + pEyi);
 Ez[i] += dt / epsz[i] * (dHyx - dHxy + pEzi);
--- a/opencl_fdtd/kernels/update_h.cl
+++ b/opencl_fdtd/kernels/update_h.cl
@ -1,155 +0,0 @@
 /*
 *  Update H-field, including any PMLs.
 *   Also precalculate values for poynting vector if necessary.
 *
 *  Template parameters:
 *   common_header: Rendered contents of common.cl
 *   pmls: [{'axis': 'x', 'polarity': 'n', 'thickness': 8}, ...] list of pml dicts containing
 *      axes, polarities, and thicknesses.
 *   do_poynting: Whether to precalculate poynting vector components (boolean)
 *   uniform_dx: If grid is uniform, uniform_dx should be the grid spacing.
 *      Otherwise, uniform_dx should be False and [inv_de{xyz}] arrays must be supplied as
 *      OpenCL args.
 *
 *  OpenCL args:
 *   E, H, dt, [inv_de{xyz}], [p{xyz}{01}h{np}, Psi_{xyz}{np}_H], [oS]
 */
 {{common_header}}
 ////////////////////////////////////////////////////////////////////////////
 /*
 *  Precalculate derivatives
 */
 {%- if uniform_dx %}
 ftype inv_dx = 1.0 / {{uniform_dx}};
 ftype inv_dy = 1.0 / {{uniform_dx}};
 ftype inv_dz = 1.0 / {{uniform_dx}};
 {%- else %}
 ftype inv_dx = inv_dex[x];
 ftype inv_dy = inv_dey[y];
 ftype inv_dz = inv_dez[z];
 {%- endif %}
 ftype dExy = (Ex[i + py] - Ex[i]) * inv_dy;
 ftype dExz = (Ex[i + pz] - Ex[i]) * inv_dz;
 ftype dEyx = (Ey[i + px] - Ey[i]) * inv_dx;
 ftype dEyz = (Ey[i + pz] - Ey[i]) * inv_dz;
 ftype dEzx = (Ez[i + px] - Ez[i]) * inv_dx;
 ftype dEzy = (Ez[i + py] - Ez[i]) * inv_dy;
 {% for bloch in bloch_boundaries -%}
    {%- set r = bloch['axis'] -%}
    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
 if ({{r}} == s{{r}} - 1) {
    ftype bloch_re = {{bloch['real']}};
    ftype bloch_im = {{bloch['imag']}};
    dE{{u ~ r}} = bloch_re * dE{{u ~ r}} + bloch_im * (F{{u}}[i + p{{u}}] - F{{u}}[i]);
    dE{{v ~ r}} = bloch_re * dE{{v ~ r}} + bloch_im * (F{{v}}[i + p{{v}}] - F{{v}}[i]);
 }
 {% endfor -%}
 {%- if do_poynting %}
 /*
 *  Precalculate averaged E
 */
 ftype aExy = Ex[i + py] + Ex[i];
 ftype aExz = Ex[i + pz] + Ex[i];
 ftype aEyx = Ey[i + px] + Ey[i];
 ftype aEyz = Ey[i + pz] + Ey[i];
 ftype aEzx = Ez[i + px] + Ez[i];
 ftype aEzy = Ez[i + py] + Ez[i];
 {%- endif %}
 /*
 *  PML Update
 */
 // PML contributions to H
 ftype pHxi = 0;
 ftype pHyi = 0;
 ftype pHzi = 0;
 {% for pml in pmls -%}
    {%- set r = pml['axis'] -%}
    {%- set p = pml['polarity'] -%}
    {%- set u, v = ['x', 'y', 'z'] | reject('equalto', r) -%}
    {%- set psi = 'Psi_' ~ r ~ p ~ '_H' -%}
    {%- if r != 'y' -%}
        {%- set se, sh = '-', '+' -%}
    {%- else -%}
        {%- set se, sh = '+', '-' -%}
    {%- endif %}
 int pml_{{r ~ p}}_thickness = {{pml['thickness']}};
    {%- if p == 'n' %}
 if ( {{r}} < pml_{{r ~ p}}_thickness ) {
    const size_t ir = {{r}};                          // index into pml parameters
    {%- elif p == 'p' %}
 if ( s{{r}} > {{r}} && {{r}} >= s{{r}} - pml_{{r ~ p}}_thickness ) {
    const size_t ir = (s{{r}} - 1) - {{r}};                     // index into pml parameters
    {%- endif %}
    const size_t ip = {{v}} + {{u}} * s{{v}} + ir * s{{v}} * s{{u}};  // linear index into Psi
    dE{{v ~ r}} *= p{{r}}2h{{p}}[ir];
    dE{{u ~ r}} *= p{{r}}2h{{p}}[ir];
    {{psi ~ u}}[ip] = p{{r}}0h{{p}}[ir] * {{psi ~ u}}[ip] + p{{r}}1h{{p}}[ir] * dE{{v ~ r}};
    {{psi ~ v}}[ip] = p{{r}}0h{{p}}[ir] * {{psi ~ v}}[ip] + p{{r}}1h{{p}}[ir] * dE{{u ~ r}};
    pH{{u}}i {{sh}}= {{psi ~ u}}[ip];
    pH{{v}}i {{se}}= {{psi ~ v}}[ip];
 }
 {%- endfor %}
 /*
 *  Update H
 */
 {% if do_poynting -%}
 // Save old H for averaging
 ftype Hx_old = Hx[i];
 ftype Hy_old = Hy[i];
 ftype Hz_old = Hz[i];
 {%- endif %}
 // H update equations
 Hx[i] -= dt * (dEzy - dEyz - pHxi);
 Hy[i] -= dt * (dExz - dEzx - pHyi);
 Hz[i] -= dt * (dEyx - dExy - pHzi);
 {% if do_poynting -%}
 // Average H across timesteps
 ftype aHxt = Hx[i] + Hx_old;
 ftype aHyt = Hy[i] + Hy_old;
 ftype aHzt = Hz[i] + Hz_old;
 /*
 *  Calculate unscaled S components at H locations
 */
 __global ftype *oSxy = oS + 0 * field_size;
 __global ftype *oSyz = oS + 1 * field_size;
 __global ftype *oSzx = oS + 2 * field_size;
 __global ftype *oSxz = oS + 3 * field_size;
 __global ftype *oSyx = oS + 4 * field_size;
 __global ftype *oSzy = oS + 5 * field_size;
 oSxy[i] = aEyx * aHzt;
 oSxz[i] = -aEzx * aHyt;
 oSyz[i] = aEzy * aHxt;
 oSyx[i] = -aExy * aHzt;
 oSzx[i] = aExz * aHyt;
 oSzy[i] = -aEyz * aHxt;
 {%- endif -%}
--- a/opencl_fdtd/kernels/update_j.cl
+++ b/opencl_fdtd/kernels/update_j.cl
@ -1,32 +0,0 @@
 /*
 *  Update E-field from J field 
 *
 *  Template parameters:
 *   common_header: Rendered contents of common.cl
 *
 *  OpenCL args:
 *   E, Jr, Ji, c, s, xmin, xmax, ymin, ymax, zmin, zmax   
 */
 {{common_header}}
 ////////////////////////////////////////////////////////////////////////////
 __global ftype *Jrx = Jr + XX;
 __global ftype *Jry = Jr + YY;
 __global ftype *Jrz = Jr + ZZ;
 __global ftype *Jix = Ji + XX;
 __global ftype *Jiy = Ji + YY;
 __global ftype *Jiz = Ji + ZZ;
 if (x < xmin || y < ymin || z < zmin) {
   PYOPENCL_ELWISE_CONTINUE;
 }
 if (x >= xmax || y >= ymax || z >= zmax) {
   PYOPENCL_ELWISE_CONTINUE;
 }
 Ex[i] += c * Jrx[i] + s * Jix[i];
 Ey[i] += c * Jry[i] + s * Jiy[i];
 Ez[i] += c * Jrz[i] + s * Jiz[i];
--- a/opencl_fdtd/kernels/update_s.cl
+++ b/opencl_fdtd/kernels/update_s.cl
@ -1,36 +0,0 @@
 /*
 *  Update E-field, including any PMLs.
 *
 *  Template parameters:
 *   common_header: Rendered contents of common.cl
 *   pmls: [('x', 'n'), ('z', 'p'),...] list of pml axes and polarities
 *   pml_thickness: Number of cells (integer)
 *
 *  OpenCL args:
 *   E, H, dt, S, oS
 */
 {{common_header}}
 //////////////////////////////////////////////////////////////////////
 /*
 * Calculate S from oS (pre-calculated components)
 */
 __global ftype *Sx = S + XX;
 __global ftype *Sy = S + YY;
 __global ftype *Sz = S + ZZ;
 // Use unscaled S components from H locations
 __global ftype *oSxy = oS + 0 * field_size;
 __global ftype *oSyz = oS + 1 * field_size;
 __global ftype *oSzx = oS + 2 * field_size;
 __global ftype *oSxz = oS + 3 * field_size;
 __global ftype *oSyx = oS + 4 * field_size;
 __global ftype *oSzy = oS + 5 * field_size;
 ftype s_factor = dt * 0.125;
 Sx[i] = (oSxy[i] + oSxz[i] + oSxy[i + my] + oSxz[i + mz]) * s_factor;
 Sy[i] = (oSyz[i] + oSyx[i] + oSyz[i + mz] + oSyx[i + mx]) * s_factor;
 Sz[i] = (oSzx[i] + oSzy[i] + oSzx[i + mx] + oSzy[i + my]) * s_factor;
--- a/opencl_fdtd/simulation.py
+++ b/opencl_fdtd/simulation.py
@ -1,376 +0,0 @@
 """
 Class for constructing and holding the basic FDTD operations and fields
 """
 from typing import List, Dict, Callable
 from collections import OrderedDict
 import numpy
 import jinja2
 import warnings
 import pyopencl
 import pyopencl.array
 from pyopencl.elementwise import ElementwiseKernel
 from fdfd_tools import vec
 __author__ = 'Jan Petykiewicz'
 # Create jinja2 env on module load
 jinja_env = jinja2.Environment(loader=jinja2.PackageLoader(__name__, 'kernels'))
 class Simulation(object):
    """
    Constructs and holds the basic FDTD operations and related fields
    After constructing this object, call the (update_E, update_H, update_S) members
     to perform FDTD updates on the stored (E, H, S) fields:
        pmls = [{'axis': a, 'polarity': p} for a in 'xyz' for p in 'np']
        sim = Simulation(grid.grids, do_poynting=True, pmls=pmls)
        with open('sources.c', 'w') as f:
            f.write('{}'.format(sim.sources))
        for t in range(max_t):
            sim.update_E([]).wait()
            # Find the linear index for the center point, for Ey
            ind = numpy.ravel_multi_index(tuple(grid.shape//2), dims=grid.shape, order='C') + \
                    numpy.prod(grid.shape) * 1
            # Perturb the field (i.e., add a soft current source)
            sim.E[ind] += numpy.sin(omega * t * sim.dt)
            event = sim.update_H([])
            if sim.update_S:
                event = sim.update_S([event])
            event.wait()
            with lzma.open('saved_simulation', 'wb') as f:
                dill.dump(fdfd_tools.unvec(sim.E.get(), grid.shape), f)
    Code in the form
        event2 = sim.update_H([event0, event1])
     indicates that the update_H operation should be prepared immediately, but wait for
     event0 and event1 to occur (i.e. previous operations to finish) before starting execution.
     event2 can then be used to prepare further operations to be run after update_H.
    """
    E = None            # type: pyopencl.array.Array
    H = None            # type: pyopencl.array.Array
    S = None            # type: pyopencl.array.Array
    eps = None          # type: pyopencl.array.Array
    dt = None           # type: float
    inv_dxes = None     # type: List[pyopencl.array.Array]
    arg_type = None     # type: numpy.float32 or numpy.float64
    context = None      # type: pyopencl.Context
    queue = None        # type: pyopencl.CommandQueue
    update_E = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
    update_H = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
    update_S = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
    update_J = None     # type: Callable[[List[pyopencl.Event]], pyopencl.Event]
    sources = None      # type: Dict[str, str]
    def __init__(self,
                 epsilon: List[numpy.ndarray],
                 pmls: List[Dict[str, int or float]],
                 bloch_boundaries: List[Dict[str, int or float]] = (),
                 dxes: List[List[numpy.ndarray]] or float = None,
                 dt: float = None,
                 initial_fields: Dict[str, List[numpy.ndarray]] = None,
                 context: pyopencl.Context = None,
                 queue: pyopencl.CommandQueue = None,
                 float_type: numpy.float32 or numpy.float64 = numpy.float32,
                 do_poynting: bool = True,
                 do_fieldsrc: bool = False):
        """
        Initialize the simulation.
        :param epsilon: List containing [eps_r,xx, eps_r,yy, eps_r,zz], where each element is a Yee-shifted ndarray
                spanning the simulation domain. Relative epsilon is used.
        :param pmls: List of dicts with keys:
            'axis': One of 'x', 'y', 'z'.
            'direction': One of 'n', 'p'.
            'thickness': Number of layers, default 8.
            'epsilon_eff': Effective epsilon to match to. Default 1.0.
            'mu_eff': Effective mu to match to. Default 1.0.
            'ln_R_per_layer': Desired (ln(R) / thickness) value. Default -1.6.
            'm': Polynomial grading exponent. Default 3.5.
            'ma': Exponent for alpha. Default 1.
        :param bloch_boundaries: List of dicts with keys:
            'axis': One of 'x', 'y', 'z'.
            'real': Real part of bloch phase factor (i.e. real(exp(i * phase)))
            'imag': Imaginary part of bloch phase factor (i.e. imag(exp(i * phase)))
        :param dt: Time step. Default is min(dxes) * .99/sqrt(3).
        :param initial_fields: Dict with optional keys ('E', 'H', 'F', 'G') containing initial values for the
            specified fields (default is 0 everywhere). Fields have same format as epsilon.
        :param context: pyOpenCL context. If not given, pyopencl.create_some_context(False) is called.
        :param queue: pyOpenCL command queue. If not given, pyopencl.CommandQueue(context) is called.
        :param float_type: numpy.float32 or numpy.float64. Default numpy.float32.
        :param do_poynting: If true, enables calculation of the poynting vector, S.
                Poynting vector calculation adds the following computational burdens:
                    * During update_H, ~6 extra additions/cell are performed in order to spatially
                        average E and temporally average H. These quantities are multiplied
                        (6 multiplications/cell) and then stored (6 writes/cell, cache-friendly).
                    * update_S performs a discrete cross product using the precalculated products
                        from update_H. This is not nice to the cache and similar to e.g. update_E
                        in complexity.
                    * GPU memory requirements are approximately doubled, since S and the intermediate
                        products must be stored.
        """
        if initial_fields is None:
            initial_fields = {}
        self.shape = epsilon[0].shape
        self.arg_type = float_type
        self.sources = {}
        self._create_context(context, queue)
        self._create_eps(epsilon)
        if dxes is None:
            dxes = 1.0
        if isinstance(dxes, (float, int)):
            uniform_dx = dxes
            min_dx = dxes
        else:
            uniform_dx = False
            self.inv_dxes = [self._create_field(1 / dxn) for dxn in dxes[0] + dxes[1]]
            min_dx = min(min(dxn) for dxn in dxes[0] + dxes[1])
        max_dt = min_dx * .99 / numpy.sqrt(3)
        if dt is None:
            self.dt = max_dt
        elif dt > max_dt:
            warnings.warn('Warning: unstable dt: {}'.format(dt))
        elif dt <= 0:
            raise Exception('Invalid dt: {}'.format(dt))
        else:
            self.dt = dt
        self.E = self._create_field(initial_fields.get('E', None))
        self.H = self._create_field(initial_fields.get('H', None))
        if bloch_boundaries:
            self.F = self._create_field(initial_fields.get('F', None))
            self.G = self._create_field(initial_fields.get('G', None))
        for pml in pmls:
            pml.setdefault('thickness', 8)
            pml.setdefault('epsilon_eff', 1.0)
            pml.setdefault('mu_eff', 1.0)
            pml.setdefault('ln_R_per_layer', -1.6)
            pml.setdefault('m', 3.5)
            pml.setdefault('cfs_alpha', 0)
            pml.setdefault('ma', 1)
        ctype = type_to_C(self.arg_type)
        def ptr(arg: str) -> str:
            return ctype + ' * restrict ' + arg
        base_fields = OrderedDict()
        base_fields[ptr('E')] = self.E
        base_fields[ptr('H')] = self.H
        base_fields[ctype + ' dt'] = self.dt
        if uniform_dx == False:
            inv_dx_names = ['inv_d' + eh + r for eh in 'eh' for r in 'xyz']
            for name, field in zip(inv_dx_names, self.inv_dxes):
                base_fields[ptr(name)] = field
        eps_field = OrderedDict()
        eps_field[ptr('eps')] = self.eps
        if bloch_boundaries:
            base_fields[ptr('F')] = self.F
            base_fields[ptr('G')] = self.G
            bloch_fields = OrderedDict()
            bloch_fields[ptr('E')] = self.F
            bloch_fields[ptr('H')] = self.G
            bloch_fields[ctype + ' dt'] = self.dt
            bloch_fields[ptr('F')] = self.E
            bloch_fields[ptr('G')] = self.H
        common_source = jinja_env.get_template('common.cl').render(
                ftype=ctype,
                shape=self.shape,
                )
        jinja_args = {
                'common_header': common_source,
                'pmls': pmls,
                'do_poynting': do_poynting,
                'bloch': bloch_boundaries,
                'uniform_dx': uniform_dx,
                }
        E_source = jinja_env.get_template('update_e.cl').render(**jinja_args)
        H_source = jinja_env.get_template('update_h.cl').render(**jinja_args)
        self.sources['E'] = E_source
        self.sources['H'] = H_source
        if bloch_boundaries:
            bloch_args = jinja_args.copy()
            bloch_args['do_poynting'] = False
            bloch_args['bloch'] = [{'axis': b['axis'],
                                    'real': b['imag'],
                                    'imag': b['real']}
                                   for b in bloch_boundaries]
            F_source = jinja_env.get_template('update_e.cl').render(**bloch_args)
            G_source = jinja_env.get_template('update_h.cl').render(**bloch_args)
            self.sources['F'] = F_source
            self.sources['G'] = G_source
        S_fields = OrderedDict()
        if do_poynting:
            S_source = jinja_env.get_template('update_s.cl').render(**jinja_args)
            self.sources['S'] = S_source
            self.oS = pyopencl.array.zeros(self.queue, self.E.shape + (2,), dtype=self.arg_type)
            self.S = pyopencl.array.zeros_like(self.E)
            S_fields[ptr('oS')] = self.oS
            S_fields[ptr('S')] = self.S
        J_fields = OrderedDict()
        if do_fieldsrc:
            J_source = jinja_env.get_template('update_j.cl').render(**jinja_args)
            self.sources['J'] = J_source
            self.Ji = pyopencl.array.zeros_like(self.E)
            self.Jr = pyopencl.array.zeros_like(self.E)
            J_fields[ptr('Jr')] = self.Jr
            J_fields[ptr('Ji')] = self.Ji
        '''
        PML
        '''
        pml_e_fields, pml_h_fields = self._create_pmls(pmls)
        if bloch_boundaries:
            pml_f_fields, pml_g_fields = self._create_pmls(pmls)
        '''
        Create operations
        '''
        self.update_E = self._create_operation(E_source, (base_fields, eps_field, pml_e_fields))
        self.update_H = self._create_operation(H_source, (base_fields, pml_h_fields, S_fields))
        if do_poynting:
            self.update_S = self._create_operation(S_source, (base_fields, S_fields))
        if bloch_boundaries:
            self.update_F = self._create_operation(F_source, (bloch_fields, eps_field, pml_f_fields))
            self.update_G = self._create_operation(G_source, (bloch_fields, pml_g_fields))
        if do_fieldsrc:
            args = OrderedDict()
            [args.update(d) for d in (base_fields, J_fields)]
            var_args = [ctype + ' ' + v for v in 'cs'] + ['uint ' + r + m for r in 'xyz' for m in ('min', 'max')]
            update = ElementwiseKernel(self.context, operation=J_source,
                                       arguments=', '.join(list(args.keys()) + var_args))
            self.update_J = lambda e, *a: update(*args.values(), *a, wait_for=e)
    def _create_pmls(self, pmls):
        ctype = type_to_C(self.arg_type)
        def ptr(arg: str) -> str:
            return ctype + ' *' + arg
        pml_e_fields = OrderedDict()
        pml_h_fields = OrderedDict()
        for pml in pmls:
            a = 'xyz'.find(pml['axis'])
            sigma_max = -pml['ln_R_per_layer'] / 2 * (pml['m'] + 1)
            kappa_max = numpy.sqrt(pml['mu_eff'] * pml['epsilon_eff'])
            alpha_max = pml['cfs_alpha']
            def par(x):
                scaling = ((x / (pml['thickness'])) ** pml['m'])
                sigma = scaling * sigma_max
                kappa = 1 + scaling * (kappa_max - 1)
                alpha = ((1 - x / pml['thickness']) ** pml['ma']) * alpha_max
                p0 = numpy.exp(-(sigma / kappa + alpha) * self.dt)
                p1 = sigma / (sigma + kappa * alpha) * (p0 - 1)
                p2 = 1/kappa
                return p0, p1, p2
            xe, xh = (numpy.arange(1, pml['thickness'] + 1, dtype=self.arg_type)[::-1] for _ in range(2))
            if pml['polarity'] == 'p':
                xe -= 0.5
            elif pml['polarity'] == 'n':
                xh -= 0.5
            pml_p_names = [['p' + pml['axis'] + i + eh + pml['polarity'] for i in '012'] for eh in 'eh']
            for name_e, name_h, pe, ph in zip(pml_p_names[0], pml_p_names[1], par(xe), par(xh)):
                pml_e_fields[ptr(name_e)] = pyopencl.array.to_device(self.queue, pe)
                pml_h_fields[ptr(name_h)] = pyopencl.array.to_device(self.queue, ph)
            uv = 'xyz'.replace(pml['axis'], '')
            psi_base = 'Psi_' + pml['axis'] + pml['polarity'] + '_'
            psi_names = [[psi_base + eh + c for c in uv] for eh in 'EH']
            psi_shape = list(self.shape)
            psi_shape[a] = pml['thickness']
            for ne, nh in zip(*psi_names):
                pml_e_fields[ptr(ne)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
                pml_h_fields[ptr(nh)] = pyopencl.array.zeros(self.queue, tuple(psi_shape), dtype=self.arg_type)
        return pml_e_fields, pml_h_fields
    def _create_operation(self, source, args_fields):
        args = OrderedDict()
        [args.update(d) for d in args_fields]
        update = ElementwiseKernel(self.context, operation=source,
                                   arguments=', '.join(args.keys()))
        return lambda e: update(*args.values(), wait_for=e)
    def _create_context(self, context: pyopencl.Context = None,
                        queue: pyopencl.CommandQueue = None):
        if context is None:
            self.context = pyopencl.create_some_context()
        else:
            self.context = context
        if queue is None:
            self.queue = pyopencl.CommandQueue(self.context)
        else:
            self.queue = queue
    def _create_eps(self, epsilon: List[numpy.ndarray]):
        if len(epsilon) != 3:
            raise Exception('Epsilon must be a list with length of 3')
        if not all((e.shape == epsilon[0].shape for e in epsilon[1:])):
            raise Exception('All epsilon grids must have the same shape. Shapes are {}', [e.shape for e in epsilon])
        if not epsilon[0].shape == self.shape:
            raise Exception('Epsilon shape mismatch. Expected {}, got {}'.format(self.shape, epsilon[0].shape))
        self.eps = pyopencl.array.to_device(self.queue, vec(epsilon).astype(self.arg_type))
    def _create_field(self, initial_value: List[numpy.ndarray] = None):
        if initial_value is None:
            return pyopencl.array.zeros_like(self.eps)
        else:
            if len(initial_value) != 3:
                Exception('Initial field value must be a list of length 3')
            if not all((f.shape == self.shape for f in initial_value)):
                Exception('Initial field list elements must have same shape as epsilon elements')
            return pyopencl.array.to_device(self.queue, vec(initial_value).astype(self.arg_type))
 def type_to_C(float_type) -> str:
    """
    Returns a string corresponding to the C equivalent of a numpy type.
    Only works for float16, float32, float64.
    :param float_type: e.g. numpy.float32
    :return: string containing the corresponding C type (eg. 'double')
    """
    if float_type == numpy.float16:
        arg_type = 'half'
    elif float_type == numpy.float32:
        arg_type = 'float'
    elif float_type == numpy.float64:
        arg_type = 'double'
    else:
        raise Exception('Unsupported type')
    return arg_type
--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,6 @@
 numpy
 h5py
 pyopencl
-jinja2
+-e git+https://mpxd.net/gogs/jan/float_raster.git@release#egg=float_raster
-e git+https://mpxd.net/code/jan/float_raster.git@release#egg=float_raster
+-e git+https://mpxd.net/gogs/jan/gridlock.git@release#egg=gridlock
-e git+https://mpxd.net/code/jan/gridlock.git@release#egg=gridlock
+-e git+https://mpxd.net/gogs/jan/masque.git@release#egg=masque
 -e git+https://mpxd.net/code/jan/masque.git@release#egg=masque
 -e git+https://mpxd.net/code/jan/fdfd_tools.git@master#egg=fdfd_tools
--- a/setup.py
+++ b/setup.py
@ -1,30 +0,0 @@
 #!/usr/bin/env python3
 from setuptools import setup, find_packages
 import opencl_fdtd
 with open('README.md', 'r') as f:
    long_description = f.read()
 setup(name='opencl_fdtd',
      version=opencl_fdtd.version,
      description='OpenCL FDTD solver',
      long_description=long_description,
      long_description_content_type='text/markdown',
      author='Jan Petykiewicz',
      author_email='anewusername@gmail.com',
      url='https://mpxd.net/code/jan/opencl_fdtd',
      packages=find_packages(),
      package_data={
          'opencl_fdfd': ['kernels/*']
      },
      install_requires=[
            'numpy',
            'pyopencl',
            'jinja2',
            'fdfd_tools>=0.3',
      ],
      extras_require={
      },
      )