"""
Default FDFD solver

This file holds the default FDFD solver, which uses an E-field wave
operator implemented directly as OpenCL arithmetic (rather than as
a matrix).
"""

from typing import List
import time

import numpy
from numpy.linalg import norm
import pyopencl
import pyopencl.array

import fdfd_tools.operators

from . import ops

__author__ = 'Jan Petykiewicz'


def cg_solver(omega: complex,
              dxes: List[List[numpy.ndarray]],
              J: numpy.ndarray,
              epsilon: numpy.ndarray,
              mu: numpy.ndarray = None,
              pec: numpy.ndarray = None,
              pmc: numpy.ndarray = None,
              adjoint: bool = False,
              max_iters: int = 40000,
              err_threshold: float = 1e-6,
              context: pyopencl.Context = None,
              verbose: bool = False,
              ) -> numpy.ndarray:
    """
    OpenCL FDFD solver using the iterative conjugate gradient (cg) method
     and implementing the diagonalized E-field wave operator directly in
     OpenCL.

    All ndarray arguments should be 1D arrays. To linearize a list of 3 3D ndarrays,
     either use fdfd_tools.vec() or numpy:
     f_1D = numpy.hstack(tuple((fi.flatten(order='F') for fi in [f_x, f_y, f_z])))

    :param omega: Complex frequency to solve at.
    :param dxes: [[dx_e, dy_e, dz_e], [dx_h, dy_h, dz_h]] (complex cell sizes)
    :param J: Electric current distribution (at E-field locations)
    :param epsilon: Dielectric constant distribution (at E-field locations)
    :param mu: Magnetic permeability distribution (at H-field locations)
    :param pec: Perfect electric conductor distribution
        (at E-field locations; non-zero value indicates PEC is present)
    :param pmc: Perfect magnetic conductor distribution
        (at H-field locations; non-zero value indicates PMC is present)
    :param adjoint: If true, solves the adjoint problem.
    :param max_iters: Maximum number of iterations. Default 40,000.
    :param err_threshold: If (r @ r.conj()) / norm(1j * omega * J) < err_threshold, success.
        Default 1e-6.
    :param context: PyOpenCL context to run in. If not given, construct a new context.
    :param verbose: If True, print progress to stdout. Default False.
    :return: E-field which solves the system. Returned even if we did not converge.
    """

    start_time = time.perf_counter()

    b = -1j * omega * J

    shape = [d.size for d in dxes[0]]

    '''
        ** In this comment, I use the following notation:
           M* = conj(M),
           M.T = transpose(M),
           M' = ctranspose(M),
           M N = dot(M, N)

        This solver uses a symmetrized wave operator M = (L A R) = (L A R).T
         (where L = inv(R) are diagonal preconditioner matrices) when
         solving the wave equation; therefore, it solves the problem
            M y = d
         => (L A R) (inv(R) x) = (L b)
         => A x = b
         with x = R y

        From the fact that M is symmetric, we can write
         (L A R)* = M* = M' = (L A R)' = R' A' L' = R* A' L*
        We obtain M* by conjugating all of our arguments (except J).

        Then we solve
         (R* A' L*) v = (R* b)
        and obtain x:
          x = L* v

        We can accomplish all this simply by conjugating everything (except J) and
         reversing the order of L and R
    '''
    if adjoint:
        # Conjugate everything
        dxes = [[numpy.conj(d) for d in dd] for dd in dxes]
        omega = numpy.conj(omega)
        epsilon = numpy.conj(epsilon)
        if mu is not None:
            mu = numpy.conj(mu)

    L, R = fdfd_tools.operators.e_full_preconditioners(dxes)

    if adjoint:
        b_preconditioned = R @ b
    else:
        b_preconditioned = L @ b

    '''
        Allocate GPU memory and load in data
    '''
    if context is None:
        context = pyopencl.create_some_context(False)

    queue = pyopencl.CommandQueue(context)

    def load_field(v, dtype=numpy.complex128):
        return pyopencl.array.to_device(queue, v.astype(dtype))

    r = load_field(b_preconditioned)  # load preconditioned b into r
    H = pyopencl.array.zeros_like(r)
    x = pyopencl.array.zeros_like(r)
    v = pyopencl.array.zeros_like(r)
    p = pyopencl.array.zeros_like(r)

    alpha = 1.0 + 0j
    rho = 1.0 + 0j
    errs = []

    inv_dxes = [[load_field(1 / d) for d in dd] for dd in dxes]
    oeps = load_field(-omega ** 2 * epsilon)
    Pl = load_field(L.diagonal())
    Pr = load_field(R.diagonal())

    if mu is None:
        invm = load_field(numpy.array([]))
    else:
        invm = load_field(1 / mu)

    if pec is None:
        gpec = load_field(numpy.array([]), dtype=numpy.int8)
    else:
        gpec = load_field(pec.astype(bool), dtype=numpy.int8)

    if pmc is None:
        gpmc = load_field(numpy.array([]), dtype=numpy.int8)
    else:
        gpmc = load_field(pmc.astype(bool), dtype=numpy.int8)

    '''
    Generate OpenCL kernels
    '''
    has_mu, has_pec, has_pmc = [q is not None for q in (mu, pec, pmc)]

    a_step_full = ops.create_a(context, shape, has_mu, has_pec, has_pmc)
    xr_step = ops.create_xr_step(context)
    rhoerr_step = ops.create_rhoerr_step(context)
    p_step = ops.create_p_step(context)
    dot = ops.create_dot(context)

    def a_step(E, H, p, events):
        return a_step_full(E, H, p, inv_dxes, oeps, invm, gpec, gpmc, Pl, Pr, events)

    '''
    Start the solve
    '''
    start_time2 = time.perf_counter()

    _, err2 = rhoerr_step(r, [])
    b_norm = numpy.sqrt(err2)
    print('b_norm check: ', b_norm)

    success = False
    for k in range(max_iters):
        if verbose:
            print('[{:06d}] rho {:.4} alpha {:4.4}'.format(k, rho, alpha), end=' ')

        rho_prev = rho
        e = xr_step(x, p, r, v, alpha, [])
        rho, err2 = rhoerr_step(r, e)

        errs += [numpy.sqrt(err2) / b_norm]

        if verbose:
            print('err', errs[-1])

        if errs[-1] < err_threshold:
            success = True
            break

        e = p_step(p, r, rho/rho_prev, [])
        e = a_step(v, H, p, e)
        alpha = rho / dot(p, v, e)

        if k % 1000 == 0:
            print(k)

    '''
    Done solving
    '''
    time_elapsed = time.perf_counter() - start_time

    # Undo preconditioners
    if adjoint:
        x = (Pl * x).get()
    else:
        x = (Pr * x).get()

    if success:
        print('Success', end='')
    else:
        print('Failure', end=', ')
    print(', {} iterations in {} sec: {} iterations/sec \
                  '.format(k, time_elapsed, k / time_elapsed))
    print('final error', errs[-1])
    print('overhead {} sec'.format(start_time2 - start_time))

    A0 = fdfd_tools.operators.e_full(omega, dxes, epsilon, mu).tocsr()
    if adjoint:
        # Remember we conjugated all the contents of A earlier
        A0 = A0.T
    print('Post-everything residual:', norm(A0 @ x - b) / norm(b))
    return x