diff --git a/README.md b/README.md
index fc90afb..4901949 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,37 @@
 # opencl_fdfd
 
-OpenCL FDFD solver
\ No newline at end of file
+**opencl_fdfd** is a 3D Finite Difference Frequency Domain (FDFD)
+solver implemented in Python and OpenCL.
+  
+**Capabilities**
+* Arbitrary distributions of the following:
+    * Dielectric constant (epsilon)
+    * Magnetic permeabilty (mu)
+    * Perfect electric conductor (PEC)
+    * Perfect magnetic conductor (PMC)
+* Variable-sized rectangular grids
+    * Stretched-coordinate PMLs (complex cell sizes allowed)
+
+Currently, only periodic boundary conditions are included.
+PEC/PMC boundaries can be implemented by drawing PEC/PMC cells near the edges.
+Bloch boundary conditions are not included but wouldn't be very hard to add.
+
+The default solver (opencl_fdfd.cg_solver(...)) located in main.py implements
+the E-field wave operator directly (ie, as a list of OpenCL instructions
+rather than a matrix). Additionally, there is a slower (and slightly more
+versatile) sovler in csr.py which attempts to solve an arbitrary sparse
+matrix in compressed sparse row (CSR) format using the same conjugate gradient
+method as the default solver. The CSR solver is significantly slower, but can
+be very useful for testing alternative formulations of the FDFD wave equation.
+
+Currently, this solver only uses a single GPU or other OpenCL accelerator;
+generalization to multiple GPUs should be pretty straightforward
+(ie, just copy over edge values during the matrix multiplication step).
+
+
+**Dependencies:**
+* python 3 (written and tested with 3.5) 
+* numpy
+* pyopencl
+* jinja2
+* [fdfd_tools](https://mpxd.net/gogs/jan/fdfd_tools)
\ No newline at end of file
diff --git a/opencl_fdfd/__init__.py b/opencl_fdfd/__init__.py
index b2f1f79..570efba 100644
--- a/opencl_fdfd/__init__.py
+++ b/opencl_fdfd/__init__.py
@@ -1 +1,42 @@
-from .main import cg_solver
\ No newline at end of file
+"""
+ opencl_fdfd OpenCL 3D FDFD solver
+
+ opencl_fdfd is a 3D Finite Difference Frequency Domain (FDFD) solver implemented in
+  Python and OpenCL. Its capabilities include:
+
+  - Arbitrary distributions of the following:
+    - Dielectric constant (epsilon)
+    - Magnetic permeabilty (mu)
+    - Perfect electric conductor (PEC)
+    - Perfect magnetic conductor (PMC)
+  - Variable-sized rectangular grids
+    - Stretched-coordinate PMLs (complex cell sizes allowed)
+
+  Currently, only periodic boundary conditions are included.
+  PEC/PMC boundaries can be implemented by drawing PEC/PMC cells near the edges.
+  Bloch boundary conditions are not included but wouldn't be very hard to add.
+
+  The default solver (opencl_fdfd.cg_solver(...)) located in main.py implements
+   the E-field wave operator directly (ie, as a list of OpenCL instructions
+   rather than a matrix). Additionally, there is a slower (and slightly more
+   versatile) sovler in csr.py which attempts to solve an arbitrary sparse
+   matrix in compressed sparse row (CSR) format using the same conjugate gradient
+   method as the default solver. The CSR solver is significantly slower, but can
+   be very useful for testing alternative formulations of the FDFD wave equation.
+
+  Currently, this solver only uses a single GPU or other OpenCL accelerator; generalization
+  to multiple GPUs should be pretty straightforward (ie, just copy over edge values during the
+  matrix multiplication step).
+
+
+  Dependencies:
+    - fdfd_tools    ( https://mpxd.net/gogs/jan/fdfd_tools )
+    - numpy
+    - pyopencl
+    - jinja2
+"""
+
+from .main import cg_solver
+
+__author__ = 'Jan Petykiewicz'
+
diff --git a/opencl_fdfd/csr.py b/opencl_fdfd/csr.py
index 82347ed..7deec66 100644
--- a/opencl_fdfd/csr.py
+++ b/opencl_fdfd/csr.py
@@ -1,27 +1,53 @@
+from typing import List, Dict, Any
+import time
+
 import numpy
 from numpy.linalg import norm
 import pyopencl
 import pyopencl.array
 
-import time
-
 import fdfd_tools.operators
 
 from . import ops
 
 
 class CSRMatrix(object):
+    """
+    Matrix stored in Compressed Sparse Row format, in GPU RAM.
+    """
     row_ptr = None      # type: pyopencl.array.Array
     col_ind = None      # type: pyopencl.array.Array
     data = None         # type: pyopencl.array.Array
 
-    def __init__(self, queue, m):
+    def __init__(self,
+                 queue: pyopencl.CommandQueue,
+                 m: 'scipy.sparse.csr_matrix'):
         self.row_ptr = pyopencl.array.to_device(queue, m.indptr)
         self.col_ind = pyopencl.array.to_device(queue, m.indices)
         self.data = pyopencl.array.to_device(queue, m.data.astype(numpy.complex128))
 
 
-def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose=False):
+def cg(a: 'scipy.sparse.csr_matrix',
+       b: numpy.ndarray,
+       max_iters: int = 10000,
+       err_threshold: float = 1e-6,
+       context: pyopencl.Context = None,
+       queue: pyopencl.CommandQueue = None,
+       verbose: bool = False,
+       ) -> numpy.ndarray:
+    """
+    General conjugate-gradient solver for sparse matrices, where A @ x = b.
+
+    :param a: Matrix to solve (CSR format)
+    :param b: Right-hand side vector (dense ndarray)
+    :param max_iters: Maximum number of iterations
+    :param err_threshold: Error threshold for successful solve, relative to norm(b)
+    :param context: PyOpenCL context. Will be created if not given.
+    :param queue: PyOpenCL command queue. Will be created if not given.
+    :param verbose: Whether to print statistics to screen.
+    :return: Solution vector x; returned even if solve doesn't converge.
+    """
+
     start_time = time.perf_counter()
 
     if context is None:
@@ -44,7 +70,6 @@ def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose
 
     m = CSRMatrix(queue, a)
 
-
     '''
     Generate OpenCL kernels
     '''
@@ -77,7 +102,7 @@ def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose
         if verbose:
             print('err', errs[-1])
 
-        if errs[-1] < err_thresh:
+        if errs[-1] < err_threshold:
             success = True
             break
 
@@ -108,7 +133,38 @@ def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose
     return x
 
 
-def cg_solver(omega, dxes, J, epsilon, mu=None, pec=None, pmc=None, adjoint=False, solver_opts=None):
+def cg_solver(omega: complex,
+              dxes: List[List[numpy.ndarray]],
+              J: numpy.ndarray,
+              epsilon: numpy.ndarray,
+              mu: numpy.ndarray = None,
+              pec: numpy.ndarray = None,
+              pmc: numpy.ndarray = None,
+              adjoint: bool = False,
+              solver_opts: Dict[str, Any] = None,
+              ) -> numpy.ndarray:
+    """
+    Conjugate gradient FDFD solver using CSR sparse matrices, mainly for
+     testing and development since it's much slower than the solver in main.py.
+
+    All ndarray arguments should be 1D arrays. To linearize a list of 3 3D ndarrays,
+     either use fdfd_tools.vec() or numpy:
+     f_1D = numpy.hstack(tuple((fi.flatten(order='F') for fi in [f_x, f_y, f_z])))
+
+    :param omega: Complex frequency to solve at.
+    :param dxes: [[dx_e, dy_e, dz_e], [dx_h, dy_h, dz_h]] (complex cell sizes)
+    :param J: Electric current distribution (at E-field locations)
+    :param epsilon: Dielectric constant distribution (at E-field locations)
+    :param mu: Magnetic permeability distribution (at H-field locations)
+    :param pec: Perfect electric conductor distribution
+        (at E-field locations; non-zero value indicates PEC is present)
+    :param pmc: Perfect magnetic conductor distribution
+        (at H-field locations; non-zero value indicates PMC is present)
+    :param adjoint: If true, solves the adjoint problem.
+    :param solver_opts: Passed as kwargs to opencl_fdfd.csr.cg(**solver_opts)
+    :return: E-field which solves the system.
+    """
+
     if solver_opts is None:
         solver_opts = dict()
 
diff --git a/opencl_fdfd/main.py b/opencl_fdfd/main.py
index 716f403..f9c9c33 100644
--- a/opencl_fdfd/main.py
+++ b/opencl_fdfd/main.py
@@ -1,17 +1,58 @@
+from typing import List
+import time
+
 import numpy
 from numpy.linalg import norm
 import pyopencl
 import pyopencl.array
 
-import time
-
 import fdfd_tools.operators
 
 from . import ops
 
+__author__ = 'Jan Petykiewicz'
+
+
+def cg_solver(omega: complex,
+              dxes: List[List[numpy.ndarray]],
+              J: numpy.ndarray,
+              epsilon: numpy.ndarray,
+              mu: numpy.ndarray = None,
+              pec: numpy.ndarray = None,
+              pmc: numpy.ndarray = None,
+              adjoint: bool = False,
+              max_iters: int = 40000,
+              err_threshold: float = 1e-6,
+              context: pyopencl.Context = None,
+              verbose: bool = False,
+              ) -> numpy.ndarray:
+    """
+    OpenCL FDFD solver using the iterative conjugate gradient (cg) method
+     and implementing the diagonalized E-field wave operator directly in
+     OpenCL.
+
+    All ndarray arguments should be 1D arrays. To linearize a list of 3 3D ndarrays,
+     either use fdfd_tools.vec() or numpy:
+     f_1D = numpy.hstack(tuple((fi.flatten(order='F') for fi in [f_x, f_y, f_z])))
+
+    :param omega: Complex frequency to solve at.
+    :param dxes: [[dx_e, dy_e, dz_e], [dx_h, dy_h, dz_h]] (complex cell sizes)
+    :param J: Electric current distribution (at E-field locations)
+    :param epsilon: Dielectric constant distribution (at E-field locations)
+    :param mu: Magnetic permeability distribution (at H-field locations)
+    :param pec: Perfect electric conductor distribution
+        (at E-field locations; non-zero value indicates PEC is present)
+    :param pmc: Perfect magnetic conductor distribution
+        (at H-field locations; non-zero value indicates PMC is present)
+    :param adjoint: If true, solves the adjoint problem.
+    :param max_iters: Maximum number of iterations. Default 40,000.
+    :param err_threshold: If (r @ r.conj()) / norm(1j * omega * J) < err_threshold, success.
+        Default 1e-6.
+    :param context: PyOpenCL context to run in. If not given, construct a new context.
+    :param verbose: If True, print progress to stdout. Default False.
+    :return: E-field which solves the system. Returned even if we did not converge.
+    """
 
-def cg_solver(omega, dxes, J, epsilon, mu=None, pec=None, pmc=None, adjoint=False,
-              max_iters=40000, err_thresh=1e-6, context=None, verbose=False):
     start_time = time.perf_counter()
 
     b = -1j * omega * J
@@ -138,7 +179,7 @@ def cg_solver(omega, dxes, J, epsilon, mu=None, pec=None, pmc=None, adjoint=Fals
         if verbose:
             print('err', errs[-1])
 
-        if errs[-1] < err_thresh:
+        if errs[-1] < err_threshold:
             success = True
             break
 
diff --git a/opencl_fdfd/ops.py b/opencl_fdfd/ops.py
index f0b5433..395af3c 100644
--- a/opencl_fdfd/ops.py
+++ b/opencl_fdfd/ops.py
@@ -1,3 +1,5 @@
+from typing import List, Callable
+
 import numpy
 import jinja2
 
@@ -9,6 +11,9 @@ from pyopencl.reduction import ReductionKernel
 # Create jinja2 env on module load
 jinja_env = jinja2.Environment(loader=jinja2.PackageLoader(__name__, 'kernels'))
 
+# Return type for the create_opname(...) functions
+operation = Callable[..., List[pyopencl.Event]]
+
 
 def type_to_C(float_type: numpy.float32 or numpy.float64) -> str:
     """
@@ -28,9 +33,11 @@ def type_to_C(float_type: numpy.float32 or numpy.float64) -> str:
 
     return types[float_type]
 
+# Type names
 ctype = type_to_C(numpy.complex128)
 ctype_bare = 'cdouble'
 
+# Preamble for all OpenCL code
 preamble = '''
 #define PYOPENCL_DEFINE_CDOUBLE
 #include <pyopencl-complex.h>
@@ -44,11 +51,43 @@ preamble = '''
 '''.format(ctype=ctype_bare)
 
 
-def ptrs(*args):
+def ptrs(*args: str) -> List[str]:
     return [ctype + ' *' + s for s in args]
 
 
-def create_a(context, shape, mu=False, pec=False, pmc=False):
+def create_a(context: pyopencl.Context,
+             shape: numpy.ndarray,
+             mu: bool = False,
+             pec: bool = False,
+             pmc: bool = False,
+             ) -> operation:
+    """
+    Return a function which performs (A @ p), where A is the FDFD wave equation for E-field.
+
+    The returned function has the signature
+    spmv(E, H, p, idxes, oeps, inv_mu, pec, pmc, Pl, Pr, e)
+    with arguments (all except e are of type pyopencl.array.Array (or contain it)):
+     E      E-field (output)
+     H      Temporary variable for holding intermediate H-field values on GPU (same size as E)
+     p      p-vector (input vector)
+     idxes  list holding [[1/dx_e, 1/dy_e, 1/dz_e],  [1/dx_h, 1/dy_h, 1/dz_h]] (complex cell widths)
+     oeps   omega * epsilon
+     inv_mu 1/mu
+     pec    array of bytes; nonzero value indicates presence of PEC
+     pmc    array of bytes; nonzero value indicates presence of PMC
+     Pl     Left preconditioner (array containing diagonal entries only)
+     Pr     Right preconditioner (array containing diagonal entries only)
+     e      List of pyopencl.Event; execution will wait until these are finished.
+
+     and returns a list of pyopencl.Event.
+
+    :param context: PyOpenCL context
+    :param shape: Dimensions of the E-field
+    :param mu: False iff (mu == 1) everywhere
+    :param pec: False iff no PEC anywhere
+    :param pmc: False iff no PMC anywhere
+    :return: Function for computing (A @ p)
+    """
 
     common_source = jinja_env.get_template('common.cl').render(shape=shape)
 
@@ -57,6 +96,9 @@ def create_a(context, shape, mu=False, pec=False, pmc=False):
     des = [ctype + ' *inv_de' + a for a in 'xyz']
     dhs = [ctype + ' *inv_dh' + a for a in 'xyz']
 
+    '''
+    Convert p to initial E (ie, apply right preconditioner and PEC)
+    '''
     p2e_source = jinja_env.get_template('p2e.cl').render(pec=pec)
     P2E_kernel = ElementwiseKernel(context,
                                    name='P2E',
@@ -64,6 +106,9 @@ def create_a(context, shape, mu=False, pec=False, pmc=False):
                                    operation=p2e_source,
                                    arguments=', '.join(ptrs('E', 'p', 'Pr') + pec_arg))
 
+    '''
+    Calculate intermediate H from intermediate E
+    '''
     e2h_source = jinja_env.get_template('e2h.cl').render(mu=mu,
                                                          pmc=pmc,
                                                          common_cl=common_source)
@@ -73,6 +118,9 @@ def create_a(context, shape, mu=False, pec=False, pmc=False):
                                    operation=e2h_source,
                                    arguments=', '.join(ptrs('E', 'H', 'inv_mu') + pmc_arg + des))
 
+    '''
+    Calculate final E (including left preconditioner)
+    '''
     h2e_source = jinja_env.get_template('h2e.cl').render(pec=pec,
                                                          common_cl=common_source)
     H2E_kernel = ElementwiseKernel(context,
@@ -90,7 +138,20 @@ def create_a(context, shape, mu=False, pec=False, pmc=False):
     return spmv
 
 
-def create_xr_step(context):
+def create_xr_step(context: pyopencl.Context) -> operation:
+    """
+    Return a function
+     xr_update(x, p, r, v, alpha, e)
+    which performs the operations
+     x += alpha * p
+     r -= alpha * v
+
+    after waiting for all in the list e
+    and returns a list of pyopencl.Event
+
+    :param context: PyOpenCL context
+    :return: Function for performing x and r updates
+    """
     update_xr_source = '''
     x[i] = add(x[i], mul(alpha, p[i]));
     r[i] = sub(r[i], mul(alpha, v[i]));
@@ -110,13 +171,28 @@ def create_xr_step(context):
     return xr_update
 
 
-def create_rhoerr_step(context):
+def create_rhoerr_step(context: pyopencl.Context) -> operation:
+    """
+    Return a function
+     ri_update(r, e)
+    which performs the operations
+     rho = r * r.conj()
+     err = r * r
+
+    after waiting for all pyopencl.Event in the list e
+    and returns a list of pyopencl.Event
+
+    :param context: PyOpenCL context
+    :return: Function for performing x and r updates
+    """
+
     update_ri_source = '''
     (double3)(r[i].real * r[i].real, \
               r[i].real * r[i].imag, \
               r[i].imag * r[i].imag)
     '''
 
+    # Use a vector type (double3) to make the reduction simpler
     ri_dtype = pyopencl.array.vec.double3
 
     ri_kernel = ReductionKernel(context,
@@ -138,7 +214,19 @@ def create_rhoerr_step(context):
     return ri_update
 
 
-def create_p_step(context):
+def create_p_step(context: pyopencl.Context) -> operation:
+    """
+    Return a function
+     p_update(p, r, beta, e)
+    which performs the operation
+     p = r + beta * p
+
+    after waiting for all pyopencl.Event in the list e
+    and returns a list of pyopencl.Event
+
+    :param context: PyOpenCL context
+    :return: Function for performing the p update
+    """
     update_p_source = '''
     p[i] = add(r[i], mul(beta, p[i]));
     '''
@@ -156,7 +244,16 @@ def create_p_step(context):
     return p_update
 
 
-def create_dot(context):
+def create_dot(context: pyopencl.Context) -> operation:
+    """
+    Return a function for performing the dot product
+     p @ v
+    with the signature
+     dot(p, v, e) -> float
+
+    :param context: PyOpenCL context
+    :return: Function for performing the dot product
+    """
     dot_dtype = numpy.complex128
 
     dot_kernel = ReductionKernel(context,
@@ -168,14 +265,30 @@ def create_dot(context):
                                  reduce_expr='add(a, b)',
                                  arguments=ptrs('p', 'v'))
 
-    def ri_update(p, v, e):
+    def dot(p, v, e):
         g = dot_kernel(p, v, wait_for=e)
         return g.get()
 
-    return ri_update
+    return dot
 
 
-def create_a_csr(context):
+def create_a_csr(context: pyopencl.Context) -> operation:
+    """
+    Return a function for performing the operation
+     (N @ v)
+    where N is stored in CSR (compressed sparse row) format.
+
+    The function signature is
+     spmv(v_out, m, v_in, e)
+    where m is an opencl_fdfd.csr.CSRMatrix
+    and v_out, v_in are (dense) vectors (of type pyopencl.array.Array).
+
+    The function waits on all the pyopencl.Event in e before running, and returns
+     a list of pyopencl.Event.
+
+    :param context: PyOpenCL context
+    :return: Function for sparse (M @ v) operation where M is in CSR format
+    """
     spmv_source = '''
     int start = m_row_ptr[i];
     int stop = m_row_ptr[i+1];