From 89caff471f674188ae58f5aabc7901d9a81e8dc1 Mon Sep 17 00:00:00 2001 From: jan Date: Thu, 4 Aug 2016 17:43:01 -0700 Subject: [PATCH] Documentation! --- README.md | 36 ++++++++++- opencl_fdfd/__init__.py | 43 ++++++++++++- opencl_fdfd/csr.py | 70 ++++++++++++++++++--- opencl_fdfd/main.py | 51 ++++++++++++++-- opencl_fdfd/ops.py | 131 +++++++++++++++++++++++++++++++++++++--- 5 files changed, 308 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index fc90afb..4901949 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,37 @@ # opencl_fdfd -OpenCL FDFD solver \ No newline at end of file +**opencl_fdfd** is a 3D Finite Difference Frequency Domain (FDFD) +solver implemented in Python and OpenCL. + +**Capabilities** +* Arbitrary distributions of the following: + * Dielectric constant (epsilon) + * Magnetic permeabilty (mu) + * Perfect electric conductor (PEC) + * Perfect magnetic conductor (PMC) +* Variable-sized rectangular grids + * Stretched-coordinate PMLs (complex cell sizes allowed) + +Currently, only periodic boundary conditions are included. +PEC/PMC boundaries can be implemented by drawing PEC/PMC cells near the edges. +Bloch boundary conditions are not included but wouldn't be very hard to add. + +The default solver (opencl_fdfd.cg_solver(...)) located in main.py implements +the E-field wave operator directly (ie, as a list of OpenCL instructions +rather than a matrix). Additionally, there is a slower (and slightly more +versatile) sovler in csr.py which attempts to solve an arbitrary sparse +matrix in compressed sparse row (CSR) format using the same conjugate gradient +method as the default solver. The CSR solver is significantly slower, but can +be very useful for testing alternative formulations of the FDFD wave equation. + +Currently, this solver only uses a single GPU or other OpenCL accelerator; +generalization to multiple GPUs should be pretty straightforward +(ie, just copy over edge values during the matrix multiplication step). + + +**Dependencies:** +* python 3 (written and tested with 3.5) +* numpy +* pyopencl +* jinja2 +* [fdfd_tools](https://mpxd.net/gogs/jan/fdfd_tools) \ No newline at end of file diff --git a/opencl_fdfd/__init__.py b/opencl_fdfd/__init__.py index b2f1f79..570efba 100644 --- a/opencl_fdfd/__init__.py +++ b/opencl_fdfd/__init__.py @@ -1 +1,42 @@ -from .main import cg_solver \ No newline at end of file +""" + opencl_fdfd OpenCL 3D FDFD solver + + opencl_fdfd is a 3D Finite Difference Frequency Domain (FDFD) solver implemented in + Python and OpenCL. Its capabilities include: + + - Arbitrary distributions of the following: + - Dielectric constant (epsilon) + - Magnetic permeabilty (mu) + - Perfect electric conductor (PEC) + - Perfect magnetic conductor (PMC) + - Variable-sized rectangular grids + - Stretched-coordinate PMLs (complex cell sizes allowed) + + Currently, only periodic boundary conditions are included. + PEC/PMC boundaries can be implemented by drawing PEC/PMC cells near the edges. + Bloch boundary conditions are not included but wouldn't be very hard to add. + + The default solver (opencl_fdfd.cg_solver(...)) located in main.py implements + the E-field wave operator directly (ie, as a list of OpenCL instructions + rather than a matrix). Additionally, there is a slower (and slightly more + versatile) sovler in csr.py which attempts to solve an arbitrary sparse + matrix in compressed sparse row (CSR) format using the same conjugate gradient + method as the default solver. The CSR solver is significantly slower, but can + be very useful for testing alternative formulations of the FDFD wave equation. + + Currently, this solver only uses a single GPU or other OpenCL accelerator; generalization + to multiple GPUs should be pretty straightforward (ie, just copy over edge values during the + matrix multiplication step). + + + Dependencies: + - fdfd_tools ( https://mpxd.net/gogs/jan/fdfd_tools ) + - numpy + - pyopencl + - jinja2 +""" + +from .main import cg_solver + +__author__ = 'Jan Petykiewicz' + diff --git a/opencl_fdfd/csr.py b/opencl_fdfd/csr.py index 82347ed..7deec66 100644 --- a/opencl_fdfd/csr.py +++ b/opencl_fdfd/csr.py @@ -1,27 +1,53 @@ +from typing import List, Dict, Any +import time + import numpy from numpy.linalg import norm import pyopencl import pyopencl.array -import time - import fdfd_tools.operators from . import ops class CSRMatrix(object): + """ + Matrix stored in Compressed Sparse Row format, in GPU RAM. + """ row_ptr = None # type: pyopencl.array.Array col_ind = None # type: pyopencl.array.Array data = None # type: pyopencl.array.Array - def __init__(self, queue, m): + def __init__(self, + queue: pyopencl.CommandQueue, + m: 'scipy.sparse.csr_matrix'): self.row_ptr = pyopencl.array.to_device(queue, m.indptr) self.col_ind = pyopencl.array.to_device(queue, m.indices) self.data = pyopencl.array.to_device(queue, m.data.astype(numpy.complex128)) -def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose=False): +def cg(a: 'scipy.sparse.csr_matrix', + b: numpy.ndarray, + max_iters: int = 10000, + err_threshold: float = 1e-6, + context: pyopencl.Context = None, + queue: pyopencl.CommandQueue = None, + verbose: bool = False, + ) -> numpy.ndarray: + """ + General conjugate-gradient solver for sparse matrices, where A @ x = b. + + :param a: Matrix to solve (CSR format) + :param b: Right-hand side vector (dense ndarray) + :param max_iters: Maximum number of iterations + :param err_threshold: Error threshold for successful solve, relative to norm(b) + :param context: PyOpenCL context. Will be created if not given. + :param queue: PyOpenCL command queue. Will be created if not given. + :param verbose: Whether to print statistics to screen. + :return: Solution vector x; returned even if solve doesn't converge. + """ + start_time = time.perf_counter() if context is None: @@ -44,7 +70,6 @@ def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose m = CSRMatrix(queue, a) - ''' Generate OpenCL kernels ''' @@ -77,7 +102,7 @@ def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose if verbose: print('err', errs[-1]) - if errs[-1] < err_thresh: + if errs[-1] < err_threshold: success = True break @@ -108,7 +133,38 @@ def cg(a, b, max_iters=10000, err_thresh=1e-6, context=None, queue=None, verbose return x -def cg_solver(omega, dxes, J, epsilon, mu=None, pec=None, pmc=None, adjoint=False, solver_opts=None): +def cg_solver(omega: complex, + dxes: List[List[numpy.ndarray]], + J: numpy.ndarray, + epsilon: numpy.ndarray, + mu: numpy.ndarray = None, + pec: numpy.ndarray = None, + pmc: numpy.ndarray = None, + adjoint: bool = False, + solver_opts: Dict[str, Any] = None, + ) -> numpy.ndarray: + """ + Conjugate gradient FDFD solver using CSR sparse matrices, mainly for + testing and development since it's much slower than the solver in main.py. + + All ndarray arguments should be 1D arrays. To linearize a list of 3 3D ndarrays, + either use fdfd_tools.vec() or numpy: + f_1D = numpy.hstack(tuple((fi.flatten(order='F') for fi in [f_x, f_y, f_z]))) + + :param omega: Complex frequency to solve at. + :param dxes: [[dx_e, dy_e, dz_e], [dx_h, dy_h, dz_h]] (complex cell sizes) + :param J: Electric current distribution (at E-field locations) + :param epsilon: Dielectric constant distribution (at E-field locations) + :param mu: Magnetic permeability distribution (at H-field locations) + :param pec: Perfect electric conductor distribution + (at E-field locations; non-zero value indicates PEC is present) + :param pmc: Perfect magnetic conductor distribution + (at H-field locations; non-zero value indicates PMC is present) + :param adjoint: If true, solves the adjoint problem. + :param solver_opts: Passed as kwargs to opencl_fdfd.csr.cg(**solver_opts) + :return: E-field which solves the system. + """ + if solver_opts is None: solver_opts = dict() diff --git a/opencl_fdfd/main.py b/opencl_fdfd/main.py index 716f403..f9c9c33 100644 --- a/opencl_fdfd/main.py +++ b/opencl_fdfd/main.py @@ -1,17 +1,58 @@ +from typing import List +import time + import numpy from numpy.linalg import norm import pyopencl import pyopencl.array -import time - import fdfd_tools.operators from . import ops +__author__ = 'Jan Petykiewicz' + + +def cg_solver(omega: complex, + dxes: List[List[numpy.ndarray]], + J: numpy.ndarray, + epsilon: numpy.ndarray, + mu: numpy.ndarray = None, + pec: numpy.ndarray = None, + pmc: numpy.ndarray = None, + adjoint: bool = False, + max_iters: int = 40000, + err_threshold: float = 1e-6, + context: pyopencl.Context = None, + verbose: bool = False, + ) -> numpy.ndarray: + """ + OpenCL FDFD solver using the iterative conjugate gradient (cg) method + and implementing the diagonalized E-field wave operator directly in + OpenCL. + + All ndarray arguments should be 1D arrays. To linearize a list of 3 3D ndarrays, + either use fdfd_tools.vec() or numpy: + f_1D = numpy.hstack(tuple((fi.flatten(order='F') for fi in [f_x, f_y, f_z]))) + + :param omega: Complex frequency to solve at. + :param dxes: [[dx_e, dy_e, dz_e], [dx_h, dy_h, dz_h]] (complex cell sizes) + :param J: Electric current distribution (at E-field locations) + :param epsilon: Dielectric constant distribution (at E-field locations) + :param mu: Magnetic permeability distribution (at H-field locations) + :param pec: Perfect electric conductor distribution + (at E-field locations; non-zero value indicates PEC is present) + :param pmc: Perfect magnetic conductor distribution + (at H-field locations; non-zero value indicates PMC is present) + :param adjoint: If true, solves the adjoint problem. + :param max_iters: Maximum number of iterations. Default 40,000. + :param err_threshold: If (r @ r.conj()) / norm(1j * omega * J) < err_threshold, success. + Default 1e-6. + :param context: PyOpenCL context to run in. If not given, construct a new context. + :param verbose: If True, print progress to stdout. Default False. + :return: E-field which solves the system. Returned even if we did not converge. + """ -def cg_solver(omega, dxes, J, epsilon, mu=None, pec=None, pmc=None, adjoint=False, - max_iters=40000, err_thresh=1e-6, context=None, verbose=False): start_time = time.perf_counter() b = -1j * omega * J @@ -138,7 +179,7 @@ def cg_solver(omega, dxes, J, epsilon, mu=None, pec=None, pmc=None, adjoint=Fals if verbose: print('err', errs[-1]) - if errs[-1] < err_thresh: + if errs[-1] < err_threshold: success = True break diff --git a/opencl_fdfd/ops.py b/opencl_fdfd/ops.py index f0b5433..395af3c 100644 --- a/opencl_fdfd/ops.py +++ b/opencl_fdfd/ops.py @@ -1,3 +1,5 @@ +from typing import List, Callable + import numpy import jinja2 @@ -9,6 +11,9 @@ from pyopencl.reduction import ReductionKernel # Create jinja2 env on module load jinja_env = jinja2.Environment(loader=jinja2.PackageLoader(__name__, 'kernels')) +# Return type for the create_opname(...) functions +operation = Callable[..., List[pyopencl.Event]] + def type_to_C(float_type: numpy.float32 or numpy.float64) -> str: """ @@ -28,9 +33,11 @@ def type_to_C(float_type: numpy.float32 or numpy.float64) -> str: return types[float_type] +# Type names ctype = type_to_C(numpy.complex128) ctype_bare = 'cdouble' +# Preamble for all OpenCL code preamble = ''' #define PYOPENCL_DEFINE_CDOUBLE #include @@ -44,11 +51,43 @@ preamble = ''' '''.format(ctype=ctype_bare) -def ptrs(*args): +def ptrs(*args: str) -> List[str]: return [ctype + ' *' + s for s in args] -def create_a(context, shape, mu=False, pec=False, pmc=False): +def create_a(context: pyopencl.Context, + shape: numpy.ndarray, + mu: bool = False, + pec: bool = False, + pmc: bool = False, + ) -> operation: + """ + Return a function which performs (A @ p), where A is the FDFD wave equation for E-field. + + The returned function has the signature + spmv(E, H, p, idxes, oeps, inv_mu, pec, pmc, Pl, Pr, e) + with arguments (all except e are of type pyopencl.array.Array (or contain it)): + E E-field (output) + H Temporary variable for holding intermediate H-field values on GPU (same size as E) + p p-vector (input vector) + idxes list holding [[1/dx_e, 1/dy_e, 1/dz_e], [1/dx_h, 1/dy_h, 1/dz_h]] (complex cell widths) + oeps omega * epsilon + inv_mu 1/mu + pec array of bytes; nonzero value indicates presence of PEC + pmc array of bytes; nonzero value indicates presence of PMC + Pl Left preconditioner (array containing diagonal entries only) + Pr Right preconditioner (array containing diagonal entries only) + e List of pyopencl.Event; execution will wait until these are finished. + + and returns a list of pyopencl.Event. + + :param context: PyOpenCL context + :param shape: Dimensions of the E-field + :param mu: False iff (mu == 1) everywhere + :param pec: False iff no PEC anywhere + :param pmc: False iff no PMC anywhere + :return: Function for computing (A @ p) + """ common_source = jinja_env.get_template('common.cl').render(shape=shape) @@ -57,6 +96,9 @@ def create_a(context, shape, mu=False, pec=False, pmc=False): des = [ctype + ' *inv_de' + a for a in 'xyz'] dhs = [ctype + ' *inv_dh' + a for a in 'xyz'] + ''' + Convert p to initial E (ie, apply right preconditioner and PEC) + ''' p2e_source = jinja_env.get_template('p2e.cl').render(pec=pec) P2E_kernel = ElementwiseKernel(context, name='P2E', @@ -64,6 +106,9 @@ def create_a(context, shape, mu=False, pec=False, pmc=False): operation=p2e_source, arguments=', '.join(ptrs('E', 'p', 'Pr') + pec_arg)) + ''' + Calculate intermediate H from intermediate E + ''' e2h_source = jinja_env.get_template('e2h.cl').render(mu=mu, pmc=pmc, common_cl=common_source) @@ -73,6 +118,9 @@ def create_a(context, shape, mu=False, pec=False, pmc=False): operation=e2h_source, arguments=', '.join(ptrs('E', 'H', 'inv_mu') + pmc_arg + des)) + ''' + Calculate final E (including left preconditioner) + ''' h2e_source = jinja_env.get_template('h2e.cl').render(pec=pec, common_cl=common_source) H2E_kernel = ElementwiseKernel(context, @@ -90,7 +138,20 @@ def create_a(context, shape, mu=False, pec=False, pmc=False): return spmv -def create_xr_step(context): +def create_xr_step(context: pyopencl.Context) -> operation: + """ + Return a function + xr_update(x, p, r, v, alpha, e) + which performs the operations + x += alpha * p + r -= alpha * v + + after waiting for all in the list e + and returns a list of pyopencl.Event + + :param context: PyOpenCL context + :return: Function for performing x and r updates + """ update_xr_source = ''' x[i] = add(x[i], mul(alpha, p[i])); r[i] = sub(r[i], mul(alpha, v[i])); @@ -110,13 +171,28 @@ def create_xr_step(context): return xr_update -def create_rhoerr_step(context): +def create_rhoerr_step(context: pyopencl.Context) -> operation: + """ + Return a function + ri_update(r, e) + which performs the operations + rho = r * r.conj() + err = r * r + + after waiting for all pyopencl.Event in the list e + and returns a list of pyopencl.Event + + :param context: PyOpenCL context + :return: Function for performing x and r updates + """ + update_ri_source = ''' (double3)(r[i].real * r[i].real, \ r[i].real * r[i].imag, \ r[i].imag * r[i].imag) ''' + # Use a vector type (double3) to make the reduction simpler ri_dtype = pyopencl.array.vec.double3 ri_kernel = ReductionKernel(context, @@ -138,7 +214,19 @@ def create_rhoerr_step(context): return ri_update -def create_p_step(context): +def create_p_step(context: pyopencl.Context) -> operation: + """ + Return a function + p_update(p, r, beta, e) + which performs the operation + p = r + beta * p + + after waiting for all pyopencl.Event in the list e + and returns a list of pyopencl.Event + + :param context: PyOpenCL context + :return: Function for performing the p update + """ update_p_source = ''' p[i] = add(r[i], mul(beta, p[i])); ''' @@ -156,7 +244,16 @@ def create_p_step(context): return p_update -def create_dot(context): +def create_dot(context: pyopencl.Context) -> operation: + """ + Return a function for performing the dot product + p @ v + with the signature + dot(p, v, e) -> float + + :param context: PyOpenCL context + :return: Function for performing the dot product + """ dot_dtype = numpy.complex128 dot_kernel = ReductionKernel(context, @@ -168,14 +265,30 @@ def create_dot(context): reduce_expr='add(a, b)', arguments=ptrs('p', 'v')) - def ri_update(p, v, e): + def dot(p, v, e): g = dot_kernel(p, v, wait_for=e) return g.get() - return ri_update + return dot -def create_a_csr(context): +def create_a_csr(context: pyopencl.Context) -> operation: + """ + Return a function for performing the operation + (N @ v) + where N is stored in CSR (compressed sparse row) format. + + The function signature is + spmv(v_out, m, v_in, e) + where m is an opencl_fdfd.csr.CSRMatrix + and v_out, v_in are (dense) vectors (of type pyopencl.array.Array). + + The function waits on all the pyopencl.Event in e before running, and returns + a list of pyopencl.Event. + + :param context: PyOpenCL context + :return: Function for sparse (M @ v) operation where M is in CSR format + """ spmv_source = ''' int start = m_row_ptr[i]; int stop = m_row_ptr[i+1];