From 40677664784477735b7dc3852e2db9c89c58b487 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 8 Jan 2018 16:16:26 -0800
Subject: [PATCH 1/9] use own CG implementation

---
 examples/bloch.py   |  10 +-
 fdfd_tools/bloch.py | 378 ++++++++++++++++++++++++++++----------------
 2 files changed, 250 insertions(+), 138 deletions(-)

diff --git a/examples/bloch.py b/examples/bloch.py
index 8bbd30e..793bd89 100644
--- a/examples/bloch.py
+++ b/examples/bloch.py
@@ -30,11 +30,11 @@ g2.shifts = numpy.zeros((6,3))
 g2.grids = [numpy.zeros(g.shape) for _ in range(6)]
 
 epsilon = [g.grids[0],] * 3
-reciprocal_lattice = numpy.diag(1e6/numpy.array([x_period, y_period, z_period])) #cols are vectors
+reciprocal_lattice = numpy.diag(1000/numpy.array([x_period, y_period, z_period])) #cols are vectors
 
 #print('Finding k at 1550nm')
-#k, f = bloch.find_k(frequency=1/1550,
-#                    tolerance=(1/1550 - 1/1551),
+#k, f = bloch.find_k(frequency=1000/1550,
+#                    tolerance=(1000 * (1/1550 - 1/1551)),
 #                    direction=[1, 0, 0],
 #                    G_matrix=reciprocal_lattice,
 #                    epsilon=epsilon,
@@ -47,10 +47,10 @@ for k0x in [.25]:
     k0 = numpy.array([k0x, 0, 0])
 
     kmag = norm(reciprocal_lattice @ k0)
-    tolerance = (1e6/1550) * 1e-4/1.5  # df = f * dn_eff / n
+    tolerance = (1000/1550) * 1e-4/1.5  # df = f * dn_eff / n
     logger.info('tolerance {}'.format(tolerance))
 
-    n, v = bloch.eigsolve(4, k0, G_matrix=reciprocal_lattice, epsilon=epsilon, tolerance=tolerance)
+    n, v = bloch.eigsolve(4, k0, G_matrix=reciprocal_lattice, epsilon=epsilon, tolerance=tolerance**2)
     v2e = bloch.hmn_2_exyz(k0, G_matrix=reciprocal_lattice, epsilon=epsilon)
     v2h = bloch.hmn_2_hxyz(k0, G_matrix=reciprocal_lattice, epsilon=epsilon)
     ki = bloch.generate_kmn(k0, reciprocal_lattice, g.shape)
diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index b172e21..31d364d 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -76,6 +76,7 @@ This module contains functions for generating and solving the
 from typing import List, Tuple, Callable, Dict
 import logging
 import numpy
+from numpy import pi, real, trace
 from numpy.fft import fftn, ifftn, fftfreq
 import scipy
 import scipy.optimize
@@ -337,139 +338,6 @@ def inverse_maxwell_operator_approx(k0: numpy.ndarray,
     return operator
 
 
-def eigsolve(num_modes: int,
-             k0: numpy.ndarray,
-             G_matrix: numpy.ndarray,
-             epsilon: field_t,
-             mu: field_t = None,
-             tolerance = 1e-8,
-             ) -> Tuple[numpy.ndarray, numpy.ndarray]:
-    """
-    Find the first (lowest-frequency) num_modes eigenmodes with Bloch wavevector
-     k0 of the specified structure.
-
-    :param k0: Bloch wavevector, [k0x, k0y, k0z].
-    :param G_matrix: 3x3 matrix, with reciprocal lattice vectors as columns.
-    :param epsilon: Dielectric constant distribution for the simulation.
-        All fields are sampled at cell centers (i.e., NOT Yee-gridded)
-    :param mu: Magnetic permability distribution for the simulation.
-        Default None (1 everywhere).
-    :return: (eigenvalues, eigenvectors) where eigenvalues[i] corresponds to the
-        vector eigenvectors[i, :]
-    """
-    h_size = 2 * epsilon[0].size
-
-    kmag = norm(G_matrix @ k0)
-
-    '''
-    Generate the operators
-    '''
-    mop = maxwell_operator(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
-    imop = inverse_maxwell_operator_approx(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
-
-    scipy_op = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=mop)
-    scipy_iop = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=imop)
-
-    y_shape = (h_size, num_modes)
-
-    def rayleigh_quotient(Z: numpy.ndarray, approx_grad: bool = True):
-        """
-        Absolute value of the block Rayleigh quotient, and the associated gradient.
-
-        See Johnson and Joannopoulos, Opt. Expr. 8, 3 (2001) for details (full
-         citation in module docstring).
-
-        ===
-
-        Notes on my understanding of the procedure:
-
-        Minimize f(Y) = |trace((Y.H @ A @ Y)|, making use of Y = Z @ inv(Z.H @ Z)^(1/2)
-         (a polar orthogonalization of Y). This gives f(Z) = |trace(Z.H @ A @ Z @ U)|,
-         where U = inv(Z.H @ Z). We minimize the absolute value to find the eigenvalues
-         with smallest magnitude.
-
-        The gradient is P @ (A @ Z @ U), where P = (1 - Z @ U @ Z.H) is a projection
-         onto the space orthonormal to Z. If approx_grad is True, the approximate
-         inverse of the maxwell operator is used to precondition the gradient.
-        """
-        z = Z.view(dtype=complex).reshape(y_shape)
-        U = numpy.linalg.inv(z.conj().T @ z)
-        zU = z @ U
-        AzU = scipy_op @ zU
-        zTAzU = z.conj().T @ AzU
-        f = numpy.real(numpy.trace(zTAzU))
-        if approx_grad:
-            df_dy = scipy_iop @ (AzU - zU @ zTAzU)
-        else:
-            df_dy = (AzU - zU @ zTAzU)
-        
-        df_dy_flat = df_dy.view(dtype=float).ravel()
-        return numpy.abs(f), numpy.sign(f) * df_dy_flat
-
-    '''
-    Use the conjugate gradient method and the approximate gradient calculation to
-     quickly find approximate eigenvectors.
-    '''
-    result = scipy.optimize.minimize(rayleigh_quotient,
-                                     numpy.random.rand(*y_shape, 2),
-                                     jac=True,
-                                     method='L-BFGS-B',
-                                     tol=1e-20,
-                                     options={'maxiter': 2000, 'gtol':0, 'ftol':1e-20 , 'disp':True})#, 'maxls':80, 'm':30})
-
-
-    result = scipy.optimize.minimize(lambda y: rayleigh_quotient(y, True),
-                                     result.x,
-                                     jac=True,
-                                     method='L-BFGS-B',
-                                     tol=1e-20,
-                                     options={'maxiter': 2000, 'gtol':0, 'disp':True})
-
-    result = scipy.optimize.minimize(lambda y: rayleigh_quotient(y, False),
-                                     result.x,
-                                     jac=True,
-                                     method='L-BFGS-B',
-                                     tol=1e-20,
-                                     options={'maxiter': 2000, 'gtol':0, 'disp':True})
-
-    for i in range(20):
-        result = scipy.optimize.minimize(lambda y: rayleigh_quotient(y, False),
-                                     result.x,
-                                     jac=True,
-                                     method='L-BFGS-B',
-                                     tol=1e-20,
-                                     options={'maxiter': 70, 'gtol':0, 'disp':True})
-        if result.nit == 0:
-            # We took 0 steps, so re-running won't help
-            break
-
-
-    z = result.x.view(dtype=complex).reshape(y_shape)
-
-    '''
-    Recover eigenvectors from Z
-    '''
-    U = numpy.linalg.inv(z.conj().T @ z)
-    y = z @ scipy.linalg.sqrtm(U)
-    w = y.conj().T @ (scipy_op @ y)
-
-    eigvals, w_eigvecs = numpy.linalg.eig(w)
-    eigvecs = y @ w_eigvecs
-
-    for i in range(len(eigvals)):
-        v = eigvecs[:, i]
-        n = eigvals[i]
-        v /= norm(v)
-        eigness = norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )
-        f = numpy.sqrt(-numpy.real(n))
-        df = numpy.sqrt(-numpy.real(n + eigness))
-        neff_err = kmag * (1/df - 1/f)
-        logger.info('eigness {}: {}\n neff_err: {}'.format(i, eigness, neff_err))
-
-    order = numpy.argsort(numpy.abs(eigvals))
-    return eigvals[order], eigvecs.T[order]
-
-
 def find_k(frequency: float,
            tolerance: float,
            direction: numpy.ndarray,
@@ -511,3 +379,247 @@ def find_k(frequency: float,
     return res.x * direction, res.fun + frequency
 
 
+
+def eigsolve(num_modes: int,
+             k0: numpy.ndarray,
+             G_matrix: numpy.ndarray,
+             epsilon: field_t,
+             mu: field_t = None,
+             tolerance = 1e-20,
+             ) -> Tuple[numpy.ndarray, numpy.ndarray]:
+    """
+    Find the first (lowest-frequency) num_modes eigenmodes with Bloch wavevector
+     k0 of the specified structure.
+
+    :param k0: Bloch wavevector, [k0x, k0y, k0z].
+    :param G_matrix: 3x3 matrix, with reciprocal lattice vectors as columns.
+    :param epsilon: Dielectric constant distribution for the simulation.
+        All fields are sampled at cell centers (i.e., NOT Yee-gridded)
+    :param mu: Magnetic permability distribution for the simulation.
+        Default None (1 everywhere).
+    :param tolerance: Solver stops when fractional change in the objective
+        trace(Z.H @ A @ Z @ inv(Z Z.H)) is smaller than the tolerance
+    :return: (eigenvalues, eigenvectors) where eigenvalues[i] corresponds to the
+        vector eigenvectors[i, :]
+    """
+    h_size = 2 * epsilon[0].size
+
+    kmag = norm(G_matrix @ k0)
+
+    '''
+    Generate the operators
+    '''
+    mop = maxwell_operator(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
+    imop = inverse_maxwell_operator_approx(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
+
+    scipy_op = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=mop)
+    scipy_iop = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=imop)
+
+    y_shape = (h_size, num_modes)
+
+    prev_E = 0
+    d_scale = 1
+    prev_traceGtKG = 0
+    prev_theta = 0.5
+    D = numpy.zeros(shape=y_shape, dtype=complex)
+
+    y0 = None
+    if y0 is None:
+        Z = numpy.random.rand(*y_shape).astype(complex)
+    else:
+        Z = y0
+
+    while True:
+        Z2 = Z.conj().T @ Z
+        Z_norm = numpy.sqrt(real(trace(Z2))) / num_modes
+        Z /= Z_norm
+        Z2 /= Z_norm * Z_norm
+        try:
+            U = numpy.linalg.inv(Z2)
+        except numpy.linalg.LinAlgError:
+            Z = numpy.random.rand(*y_shape).astype(complex)
+            continue
+
+        trace_U = real(trace(U))
+        if trace_U > 1e8 * num_modes:
+            Z = Z @ scipy.linalg.sqrtm(U).conj().T
+            prev_traceGtKG = 0
+            continue
+        break
+
+    def rtrace_AtB(A, B):
+        return real(numpy.sum(A.conj() * B))
+
+    def symmetrize(A):
+        return (A + A.conj().T) * 0.5
+
+    max_iters = 10000
+    for iter in range(max_iters):
+        U = numpy.linalg.inv(Z.conj().T @ Z)
+        AZ = scipy_op @ Z
+        AZU = AZ @ U
+        ZtAZU = Z.conj().T @ AZU
+        E = real(trace(ZtAZU))
+        sgn = numpy.sign(E)
+        E = numpy.abs(E)
+        G = (AZU - Z @ U @ ZtAZU) * sgn
+
+        if iter > 0 and abs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7):
+            logging.info('Optimization succeded: {} - 5e-8 < {} * {} / 2'.format(abs(E - prev_E), tolerance, E + prev_E))
+            break
+
+        KG = scipy_iop @ G
+        traceGtKG = rtrace_AtB(G, KG)
+        gamma_numerator = traceGtKG
+
+        reset_iters = 100
+        if prev_traceGtKG == 0 or iter % reset_iters == 0:
+            print('RESET!')
+            gamma = 0
+        else:
+            gamma = gamma_numerator / prev_traceGtKG
+
+        D = gamma * d_scale * D + KG
+        d_scale = numpy.sqrt(rtrace_AtB(D, D)) / num_modes
+        D /= d_scale
+
+        AD = scipy_op @ D
+        DtD = D.conj().T @ D
+        DtAD = D.conj().T @ AD
+
+        ZtD = Z.conj().T @ D
+        ZtAD = Z.conj().T @ AD
+        symZtD = symmetrize(ZtD)
+        symZtAD = symmetrize(ZtAD)
+
+        U_sZtD = U @ symZtD
+
+        dE = 2.0 * (rtrace_AtB(U, symZtAD) - rtrace_AtB(ZtAZU, U_sZtD))
+
+        S2 = DtD - 4 * symZtD @ U_sZtD
+        d2E = 2 * (rtrace_AtB(U, DtAD) -
+                   rtrace_AtB(ZtAZU, U @ S2) -
+               4 * rtrace_AtB(U, symZtAD @ U_sZtD))
+
+        # Newton-Raphson to find a root of the first derivative:
+        theta = -dE/d2E
+
+        if d2E < 0 or abs(theta) >= pi:
+            theta = -abs(prev_theta) * numpy.sign(dE)
+
+        # ZtAZU * ZtZ = ZtAZ for use in line search
+        ZtZ = Z.conj().T @ Z
+        ZtAZ = ZtAZU @ ZtZ.conj().T
+
+        def Qi_func(theta, memo=[None, None]):
+            if memo[0] == theta:
+                return memo[1]
+
+            c = numpy.cos(theta)
+            s = numpy.sin(theta)
+            Q = c*c * ZtZ + s*s * DtD + 2*s*c * symZtD
+            try:
+                Qi = numpy.linalg.inv(Q)
+            except numpy.linalg.LinAlgError:
+                logger.info('taylor Qi')
+                # if c or s small, taylor expand
+                if c < 1e-4 * s and c != 0:
+                    Qi = numpy.linalg.inv(DtD)
+                    Qi = Qi / (s*s) - 2*c/(s*s*s) * (Qi @ symZtD.conj().T @ Qi.conj().T)
+                elif s < 1e-4 * c and s != 0:
+                    Qi = numpy.linalg.inv(ZtZ)
+                    Qi = Qi / (c*c) - 2*s/(c*c*c) * (Qi @ symZtD.conj().T @ Qi.conj().T)
+                else:
+                    raise Exception('Inexplicable singularity in trace_func')
+            memo[0] = theta
+            memo[1] = Qi
+            return Qi
+
+        def trace_func(theta):
+            c = numpy.cos(theta)
+            s = numpy.sin(theta)
+            Qi = Qi_func(theta)
+            R = c*c * ZtAZ + s*s * DtAD + 2*s*c * symZtAD
+            trace = rtrace_AtB(R, Qi)
+            return numpy.abs(trace)
+
+        #def trace_deriv(theta):
+        #    Qi = Qi_func(theta)
+        #    c2 = numpy.cos(2 * theta)
+        #    s2 = numpy.sin(2 * theta)
+        #    F = -0.5*s2 *  (ZtAZ - DtAD) + c2 * symZtAD
+        #    trace_deriv = rtrace_AtB(Qi, F)
+
+        #    G = Qi @ F.conj().T @ Qi.conj().T
+        #    H = -0.5*s2 * (ZtZ - DtD) + c2 * symZtD
+        #    trace_deriv -= rtrace_AtB(G, H)
+
+        #    trace_deriv *= 2
+        #    return trace_deriv * sgn
+
+        '''
+        theta, new_E, new_dE = linmin(theta, E, dE, 0.1, min(tolerance, 1e-6), 1e-14, 0, -numpy.sign(dE) * K_PI, trace_func)
+        '''
+        #theta, n, _, new_E, _, _new_dE = scipy.optimize.line_search(trace_func, trace_deriv, xk=theta, pk=numpy.ones((1,1)), gfk=dE, old_fval=E, c1=min(tolerance, 1e-6), c2=0.1, amax=pi)
+        result = scipy.optimize.minimize_scalar(trace_func, bounds=(0, pi), tol=tolerance)
+        new_E = result.fun
+        theta = result.x
+
+        improvement = numpy.abs(E - new_E) * 2 / numpy.abs(E + new_E)
+        logger.info('linmin improvement {}'.format(improvement))
+        Z *= numpy.cos(theta)
+        Z += D * numpy.sin(theta)
+
+        prev_traceGtKG = traceGtKG
+        prev_theta = theta
+        prev_E = E
+
+    '''
+    Recover eigenvectors from Z
+    '''
+    U = numpy.linalg.inv(Z.conj().T @ Z)
+    Y = Z @ scipy.linalg.sqrtm(U)
+    W = Y.conj().T @ (scipy_op @ Y)
+
+    eigvals, W_eigvecs = numpy.linalg.eig(W)
+    eigvecs = Y @ W_eigvecs
+
+    for i in range(len(eigvals)):
+        v = eigvecs[:, i]
+        n = eigvals[i]
+        v /= norm(v)
+        eigness = norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )
+        f = numpy.sqrt(-numpy.real(n))
+        df = numpy.sqrt(-numpy.real(n + eigness))
+        neff_err = kmag * (1/df - 1/f)
+        logger.info('eigness {}: {}\n neff_err: {}'.format(i, eigness, neff_err))
+
+    order = numpy.argsort(numpy.abs(eigvals))
+    return eigvals[order], eigvecs.T[order]
+
+    #def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func):
+    #    if df0 > 0:
+    #        x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq))
+    #        return -x0, f0, -df0
+    #    elif df0 == 0:
+    #        return 0, f0, df0
+    #    else:
+    #        x = x_guess
+    #        fx = f0
+    #        dfx = df0
+
+    #        isave = numpy.zeros((2,), numpy.intc)
+    #        dsave = numpy.zeros((13,), float)
+
+    #        x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
+    #                                          x_min, x_max, isave, dsave)
+    #        for i in range(int(1e6)):
+    #            if task != 'F':
+    #                logging.info('search converged in {} iterations'.format(i))
+    #                break
+    #            fx = f(x, dfx)
+    #            x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
+    #                                              x_min, x_max, isave, dsave)
+
+    #        return x, fx, dfx
+

From c4cbdff751c19716b788c582ac6d4df0af5d3576 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 8 Jan 2018 23:28:57 -0800
Subject: [PATCH 2/9] cleanup

---
 fdfd_tools/bloch.py | 125 ++++++++++++++++++++++----------------------
 1 file changed, 63 insertions(+), 62 deletions(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index 31d364d..de070c0 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -447,15 +447,10 @@ def eigsolve(num_modes: int,
             continue
         break
 
-    def rtrace_AtB(A, B):
-        return real(numpy.sum(A.conj() * B))
-
-    def symmetrize(A):
-        return (A + A.conj().T) * 0.5
-
     max_iters = 10000
     for iter in range(max_iters):
-        U = numpy.linalg.inv(Z.conj().T @ Z)
+        ZtZ = Z.conj().T @ Z
+        U = numpy.linalg.inv(ZtZ)
         AZ = scipy_op @ Z
         AZU = AZ @ U
         ZtAZU = Z.conj().T @ AZU
@@ -469,47 +464,44 @@ def eigsolve(num_modes: int,
             break
 
         KG = scipy_iop @ G
-        traceGtKG = rtrace_AtB(G, KG)
-        gamma_numerator = traceGtKG
+        traceGtKG = _rtrace_AtB(G, KG)
 
-        reset_iters = 100
+        reset_iters = 100    # TODO
         if prev_traceGtKG == 0 or iter % reset_iters == 0:
-            print('RESET!')
+            logger.inf('CG reset')
             gamma = 0
         else:
-            gamma = gamma_numerator / prev_traceGtKG
+            gamma = traceGtKG / prev_traceGtKG
 
         D = gamma * d_scale * D + KG
-        d_scale = numpy.sqrt(rtrace_AtB(D, D)) / num_modes
+        d_scale = numpy.sqrt(_rtrace_AtB(D, D)) / num_modes
         D /= d_scale
 
+        ZtAZ = Z.conj().T @ AZ
+
         AD = scipy_op @ D
         DtD = D.conj().T @ D
         DtAD = D.conj().T @ AD
 
-        ZtD = Z.conj().T @ D
-        ZtAD = Z.conj().T @ AD
-        symZtD = symmetrize(ZtD)
-        symZtAD = symmetrize(ZtAD)
+        symZtD = _symmetrize(Z.conj().T @ D)
+        symZtAD = _symmetrize(Z.conj().T @ AD)
 
+        '''
         U_sZtD = U @ symZtD
 
-        dE = 2.0 * (rtrace_AtB(U, symZtAD) - rtrace_AtB(ZtAZU, U_sZtD))
+        dE = 2.0 * (_rtrace_AtB(U, symZtAD) -
+                    _rtrace_AtB(ZtAZU, U_sZtD))
 
-        S2 = DtD - 4 * symZtD @ U_sZtD
-        d2E = 2 * (rtrace_AtB(U, DtAD) -
-                   rtrace_AtB(ZtAZU, U @ S2) -
-               4 * rtrace_AtB(U, symZtAD @ U_sZtD))
+        d2E = 2 * (_rtrace_AtB(U, DtAD) -
+                   _rtrace_AtB(ZtAZU, U @ (DtD - 4 * symZtD @ U_sZtD)) -
+               4 * _rtrace_AtB(U, symZtAD @ U_sZtD))
 
         # Newton-Raphson to find a root of the first derivative:
         theta = -dE/d2E
 
         if d2E < 0 or abs(theta) >= pi:
             theta = -abs(prev_theta) * numpy.sign(dE)
-
-        # ZtAZU * ZtZ = ZtAZ for use in line search
-        ZtZ = Z.conj().T @ Z
-        ZtAZ = ZtAZU @ ZtZ.conj().T
+        '''
 
         def Qi_func(theta, memo=[None, None]):
             if memo[0] == theta:
@@ -525,10 +517,10 @@ def eigsolve(num_modes: int,
                 # if c or s small, taylor expand
                 if c < 1e-4 * s and c != 0:
                     Qi = numpy.linalg.inv(DtD)
-                    Qi = Qi / (s*s) - 2*c/(s*s*s) * (Qi @ symZtD.conj().T @ Qi.conj().T)
+                    Qi = Qi / (s*s) - 2*c/(s*s*s) * (Qi @ (Qi @ symZtD).conj().T)
                 elif s < 1e-4 * c and s != 0:
                     Qi = numpy.linalg.inv(ZtZ)
-                    Qi = Qi / (c*c) - 2*s/(c*c*c) * (Qi @ symZtD.conj().T @ Qi.conj().T)
+                    Qi = Qi / (c*c) - 2*s/(c*c*c) * (Qi @ (Qi @ symZtD).conj().T)
                 else:
                     raise Exception('Inexplicable singularity in trace_func')
             memo[0] = theta
@@ -540,22 +532,24 @@ def eigsolve(num_modes: int,
             s = numpy.sin(theta)
             Qi = Qi_func(theta)
             R = c*c * ZtAZ + s*s * DtAD + 2*s*c * symZtAD
-            trace = rtrace_AtB(R, Qi)
+            trace = _rtrace_AtB(R, Qi)
             return numpy.abs(trace)
 
-        #def trace_deriv(theta):
-        #    Qi = Qi_func(theta)
-        #    c2 = numpy.cos(2 * theta)
-        #    s2 = numpy.sin(2 * theta)
-        #    F = -0.5*s2 *  (ZtAZ - DtAD) + c2 * symZtAD
-        #    trace_deriv = rtrace_AtB(Qi, F)
+        '''
+        def trace_deriv(theta):
+            Qi = Qi_func(theta)
+            c2 = numpy.cos(2 * theta)
+            s2 = numpy.sin(2 * theta)
+            F = -0.5*s2 *  (ZtAZ - DtAD) + c2 * symZtAD
+            trace_deriv = _rtrace_AtB(Qi, F)
 
-        #    G = Qi @ F.conj().T @ Qi.conj().T
-        #    H = -0.5*s2 * (ZtZ - DtD) + c2 * symZtD
-        #    trace_deriv -= rtrace_AtB(G, H)
+            G = Qi @ F.conj().T @ Qi.conj().T
+            H = -0.5*s2 * (ZtZ - DtD) + c2 * symZtD
+            trace_deriv -= _rtrace_AtB(G, H)
 
-        #    trace_deriv *= 2
-        #    return trace_deriv * sgn
+            trace_deriv *= 2
+            return trace_deriv * sgn
+        '''
 
         '''
         theta, new_E, new_dE = linmin(theta, E, dE, 0.1, min(tolerance, 1e-6), 1e-14, 0, -numpy.sign(dE) * K_PI, trace_func)
@@ -597,29 +591,36 @@ def eigsolve(num_modes: int,
     order = numpy.argsort(numpy.abs(eigvals))
     return eigvals[order], eigvecs.T[order]
 
-    #def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func):
-    #    if df0 > 0:
-    #        x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq))
-    #        return -x0, f0, -df0
-    #    elif df0 == 0:
-    #        return 0, f0, df0
-    #    else:
-    #        x = x_guess
-    #        fx = f0
-    #        dfx = df0
+#def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func):
+#    if df0 > 0:
+#        x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq))
+#        return -x0, f0, -df0
+#    elif df0 == 0:
+#        return 0, f0, df0
+#    else:
+#        x = x_guess
+#        fx = f0
+#        dfx = df0
 
-    #        isave = numpy.zeros((2,), numpy.intc)
-    #        dsave = numpy.zeros((13,), float)
+#        isave = numpy.zeros((2,), numpy.intc)
+#        dsave = numpy.zeros((13,), float)
 
-    #        x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
-    #                                          x_min, x_max, isave, dsave)
-    #        for i in range(int(1e6)):
-    #            if task != 'F':
-    #                logging.info('search converged in {} iterations'.format(i))
-    #                break
-    #            fx = f(x, dfx)
-    #            x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
-    #                                              x_min, x_max, isave, dsave)
+#        x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
+#                                          x_min, x_max, isave, dsave)
+#        for i in range(int(1e6)):
+#            if task != 'F':
+#                logging.info('search converged in {} iterations'.format(i))
+#                break
+#            fx = f(x, dfx)
+#            x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
+#                                              x_min, x_max, isave, dsave)
 
-    #        return x, fx, dfx
+#        return x, fx, dfx
+
+
+def _rtrace_AtB(A, B):
+    return real(numpy.sum(A.conj() * B))
+
+def _symmetrize(A):
+    return (A + A.conj().T) * 0.5
 

From e02040c7093de5894ad1857a7fae5569d1bfcaee Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 8 Jan 2018 23:33:22 -0800
Subject: [PATCH 3/9] fixes and clarification

---
 fdfd_tools/bloch.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index de070c0..ea9e760 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -385,7 +385,9 @@ def eigsolve(num_modes: int,
              G_matrix: numpy.ndarray,
              epsilon: field_t,
              mu: field_t = None,
-             tolerance = 1e-20,
+             tolerance: float = 1e-20,
+             max_iters: int = 10000,
+             reset_iters: int = 100,
              ) -> Tuple[numpy.ndarray, numpy.ndarray]:
     """
     Find the first (lowest-frequency) num_modes eigenmodes with Bloch wavevector
@@ -447,16 +449,15 @@ def eigsolve(num_modes: int,
             continue
         break
 
-    max_iters = 10000
     for iter in range(max_iters):
         ZtZ = Z.conj().T @ Z
         U = numpy.linalg.inv(ZtZ)
         AZ = scipy_op @ Z
         AZU = AZ @ U
         ZtAZU = Z.conj().T @ AZU
-        E = real(trace(ZtAZU))
-        sgn = numpy.sign(E)
-        E = numpy.abs(E)
+        E_signed = real(trace(ZtAZU))
+        sgn = numpy.sign(E_signed)
+        E = numpy.abs(E_signed)
         G = (AZU - Z @ U @ ZtAZU) * sgn
 
         if iter > 0 and abs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7):
@@ -466,9 +467,8 @@ def eigsolve(num_modes: int,
         KG = scipy_iop @ G
         traceGtKG = _rtrace_AtB(G, KG)
 
-        reset_iters = 100    # TODO
         if prev_traceGtKG == 0 or iter % reset_iters == 0:
-            logger.inf('CG reset')
+            logger.info('CG reset')
             gamma = 0
         else:
             gamma = traceGtKG / prev_traceGtKG

From 0e47fdd5fb97da65bc211b83ce1c3f8cabf3b985 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Tue, 9 Jan 2018 00:00:58 -0800
Subject: [PATCH 4/9] randomize imaginary part of starting vector

---
 fdfd_tools/bloch.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index ea9e760..168a349 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -422,24 +422,24 @@ def eigsolve(num_modes: int,
     prev_E = 0
     d_scale = 1
     prev_traceGtKG = 0
-    prev_theta = 0.5
+    #prev_theta = 0.5
     D = numpy.zeros(shape=y_shape, dtype=complex)
 
     y0 = None
     if y0 is None:
-        Z = numpy.random.rand(*y_shape).astype(complex)
+        Z = numpy.random.rand(*y_shape) + 1j * numpy.random.rand(*y_shape)
     else:
         Z = y0
 
     while True:
-        Z2 = Z.conj().T @ Z
-        Z_norm = numpy.sqrt(real(trace(Z2))) / num_modes
+        ZtZ = Z.conj().T @ Z
+        Z_norm = numpy.sqrt(real(trace(ZtZ))) / num_modes
         Z /= Z_norm
-        Z2 /= Z_norm * Z_norm
+        ZtZ /= Z_norm * Z_norm
         try:
-            U = numpy.linalg.inv(Z2)
+            U = numpy.linalg.inv(ZtZ)
         except numpy.linalg.LinAlgError:
-            Z = numpy.random.rand(*y_shape).astype(complex)
+            Z = numpy.random.rand(*y_shape) + 1j * numpy.random.rand(*y_shape)
             continue
 
         trace_U = real(trace(U))
@@ -565,7 +565,7 @@ def eigsolve(num_modes: int,
         Z += D * numpy.sin(theta)
 
         prev_traceGtKG = traceGtKG
-        prev_theta = theta
+        #prev_theta = theta
         prev_E = E
 
     '''

From e8f836c908996fa80f866174f3ff7a1dc7ebb8db Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 15 Jan 2018 22:43:33 -0800
Subject: [PATCH 5/9] Cleanup

---
 fdfd_tools/bloch.py | 106 +++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 56 deletions(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index 168a349..1816bdc 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -73,7 +73,7 @@ This module contains functions for generating and solving the
 
 '''
 
-from typing import List, Tuple, Callable, Dict
+from typing import Tuple, Callable
 import logging
 import numpy
 from numpy import pi, real, trace
@@ -83,7 +83,6 @@ import scipy.optimize
 from scipy.linalg import norm
 import scipy.sparse.linalg as spalg
 
-from .eigensolvers import rayleigh_quotient_iteration
 from . import field_t
 
 logger = logging.getLogger(__name__)
@@ -256,7 +255,7 @@ def hmn_2_hxyz(k0: numpy.ndarray,
     :return: Function for converting h_mn into H_xyz
     """
     shape = epsilon[0].shape + (1,)
-    k_mag, m, n = generate_kmn(k0, G_matrix, shape)
+    _k_mag, m, n = generate_kmn(k0, G_matrix, shape)
 
     def operator(h: numpy.ndarray):
         hin_m, hin_n = [hi.reshape(shape) for hi in numpy.split(h, 2)]
@@ -379,7 +378,6 @@ def find_k(frequency: float,
     return res.x * direction, res.fun + frequency
 
 
-
 def eigsolve(num_modes: int,
              k0: numpy.ndarray,
              G_matrix: numpy.ndarray,
@@ -432,10 +430,8 @@ def eigsolve(num_modes: int,
         Z = y0
 
     while True:
+        Z *= num_modes / norm(Z)
         ZtZ = Z.conj().T @ Z
-        Z_norm = numpy.sqrt(real(trace(ZtZ))) / num_modes
-        Z /= Z_norm
-        ZtZ /= Z_norm * Z_norm
         try:
             U = numpy.linalg.inv(ZtZ)
         except numpy.linalg.LinAlgError:
@@ -449,7 +445,7 @@ def eigsolve(num_modes: int,
             continue
         break
 
-    for iter in range(max_iters):
+    for i in range(max_iters):
         ZtZ = Z.conj().T @ Z
         U = numpy.linalg.inv(ZtZ)
         AZ = scipy_op @ Z
@@ -460,22 +456,22 @@ def eigsolve(num_modes: int,
         E = numpy.abs(E_signed)
         G = (AZU - Z @ U @ ZtAZU) * sgn
 
-        if iter > 0 and abs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7):
+        if i > 0 and abs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7):
             logging.info('Optimization succeded: {} - 5e-8 < {} * {} / 2'.format(abs(E - prev_E), tolerance, E + prev_E))
             break
 
         KG = scipy_iop @ G
         traceGtKG = _rtrace_AtB(G, KG)
 
-        if prev_traceGtKG == 0 or iter % reset_iters == 0:
+        if prev_traceGtKG == 0 or i % reset_iters == 0:
             logger.info('CG reset')
             gamma = 0
         else:
             gamma = traceGtKG / prev_traceGtKG
 
-        D = gamma * d_scale * D + KG
-        d_scale = numpy.sqrt(_rtrace_AtB(D, D)) / num_modes
-        D /= d_scale
+        D = gamma / d_scale * D + KG
+        d_scale = num_modes / norm(D)
+        D *= d_scale
 
         ZtAZ = Z.conj().T @ AZ
 
@@ -486,22 +482,6 @@ def eigsolve(num_modes: int,
         symZtD = _symmetrize(Z.conj().T @ D)
         symZtAD = _symmetrize(Z.conj().T @ AD)
 
-        '''
-        U_sZtD = U @ symZtD
-
-        dE = 2.0 * (_rtrace_AtB(U, symZtAD) -
-                    _rtrace_AtB(ZtAZU, U_sZtD))
-
-        d2E = 2 * (_rtrace_AtB(U, DtAD) -
-                   _rtrace_AtB(ZtAZU, U @ (DtD - 4 * symZtD @ U_sZtD)) -
-               4 * _rtrace_AtB(U, symZtAD @ U_sZtD))
-
-        # Newton-Raphson to find a root of the first derivative:
-        theta = -dE/d2E
-
-        if d2E < 0 or abs(theta) >= pi:
-            theta = -abs(prev_theta) * numpy.sign(dE)
-        '''
 
         def Qi_func(theta, memo=[None, None]):
             if memo[0] == theta:
@@ -549,12 +529,25 @@ def eigsolve(num_modes: int,
 
             trace_deriv *= 2
             return trace_deriv * sgn
-        '''
 
+        U_sZtD = U @ symZtD
+
+        dE = 2.0 * (_rtrace_AtB(U, symZtAD) -
+                    _rtrace_AtB(ZtAZU, U_sZtD))
+
+        d2E = 2 * (_rtrace_AtB(U, DtAD) -
+                   _rtrace_AtB(ZtAZU, U @ (DtD - 4 * symZtD @ U_sZtD)) -
+               4 * _rtrace_AtB(U, symZtAD @ U_sZtD))
+
+        # Newton-Raphson to find a root of the first derivative:
+        theta = -dE/d2E
+
+        if d2E < 0 or abs(theta) >= pi:
+            theta = -abs(prev_theta) * numpy.sign(dE)
+
+        # theta, new_E, new_dE = linmin(theta, E, dE, 0.1, min(tolerance, 1e-6), 1e-14, 0, -numpy.sign(dE) * K_PI, trace_func)
+        theta, n, _, new_E, _, _new_dE = scipy.optimize.line_search(trace_func, trace_deriv, xk=theta, pk=numpy.ones((1,1)), gfk=dE, old_fval=E, c1=min(tolerance, 1e-6), c2=0.1, amax=pi)
         '''
-        theta, new_E, new_dE = linmin(theta, E, dE, 0.1, min(tolerance, 1e-6), 1e-14, 0, -numpy.sign(dE) * K_PI, trace_func)
-        '''
-        #theta, n, _, new_E, _, _new_dE = scipy.optimize.line_search(trace_func, trace_deriv, xk=theta, pk=numpy.ones((1,1)), gfk=dE, old_fval=E, c1=min(tolerance, 1e-6), c2=0.1, amax=pi)
         result = scipy.optimize.minimize_scalar(trace_func, bounds=(0, pi), tol=tolerance)
         new_E = result.fun
         theta = result.x
@@ -591,32 +584,33 @@ def eigsolve(num_modes: int,
     order = numpy.argsort(numpy.abs(eigvals))
     return eigvals[order], eigvecs.T[order]
 
-#def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func):
-#    if df0 > 0:
-#        x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq))
-#        return -x0, f0, -df0
-#    elif df0 == 0:
-#        return 0, f0, df0
-#    else:
-#        x = x_guess
-#        fx = f0
-#        dfx = df0
+'''
+def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func):
+    if df0 > 0:
+        x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq))
+        return -x0, f0, -df0
+    elif df0 == 0:
+        return 0, f0, df0
+    else:
+        x = x_guess
+        fx = f0
+        dfx = df0
 
-#        isave = numpy.zeros((2,), numpy.intc)
-#        dsave = numpy.zeros((13,), float)
+        isave = numpy.zeros((2,), numpy.intc)
+        dsave = numpy.zeros((13,), float)
 
-#        x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
-#                                          x_min, x_max, isave, dsave)
-#        for i in range(int(1e6)):
-#            if task != 'F':
-#                logging.info('search converged in {} iterations'.format(i))
-#                break
-#            fx = f(x, dfx)
-#            x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
-#                                              x_min, x_max, isave, dsave)
-
-#        return x, fx, dfx
+        x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
+                                          x_min, x_max, isave, dsave)
+        for i in range(int(1e6)):
+            if task != 'F':
+                logging.info('search converged in {} iterations'.format(i))
+                break
+            fx = f(x, dfx)
+            x, fx, dfx, task = minpack2.dsrch(x, fx, dfx, f_tol, df_tol, x_tol, task,
+                                              x_min, x_max, isave, dsave)
 
+        return x, fx, dfx
+'''
 
 def _rtrace_AtB(A, B):
     return real(numpy.sum(A.conj() * B))

From c1f65f61c1db4d45a5d2df0233b196aa028a23e1 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 15 Jan 2018 22:43:59 -0800
Subject: [PATCH 6/9] Use pyfftw if available

---
 fdfd_tools/bloch.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index 1816bdc..fd5f758 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -77,7 +77,7 @@ from typing import Tuple, Callable
 import logging
 import numpy
 from numpy import pi, real, trace
-from numpy.fft import fftn, ifftn, fftfreq
+from numpy.fft import fftfreq
 import scipy
 import scipy.optimize
 from scipy.linalg import norm
@@ -88,6 +88,29 @@ from . import field_t
 logger = logging.getLogger(__name__)
 
 
+try:
+    import pyfftw.interfaces.numpy_fft
+    import pyfftw.interfaces
+    import multiprocessing
+
+    pyfftw.interfaces.cache.enable()
+    pyfftw.interfaces.cache.set_keepalive_time(3600)
+    fftw_args = {
+        'threads': multiprocessing.cpu_count(),
+        'overwrite_input': True,
+        'planner_effort': 'FFTW_EXHAUSTIVE',
+        }
+
+    def fftn(*args, **kwargs):
+        return pyfftw.interfaces.numpy_fft.fftn(*args, **kwargs, **fftw_args)
+
+    def ifftn(*args, **kwargs):
+        return pyfftw.interfaces.numpy_fft.ifftn(*args, **kwargs, **fftw_args)
+
+except ImportError:
+    from numpy.fft import fftn, ifftn
+
+
 def generate_kmn(k0: numpy.ndarray,
                  G_matrix: numpy.ndarray,
                  shape: numpy.ndarray

From ee9abb77d9f42e79074357e95aee960e7588a566 Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 15 Jan 2018 22:44:14 -0800
Subject: [PATCH 7/9] Fix approx_inverse operator

---
 fdfd_tools/bloch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index fd5f758..aabc96c 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -352,8 +352,8 @@ def inverse_maxwell_operator_approx(k0: numpy.ndarray,
         d_xyz = fftn(ifftn(e_xyz, axes=range(3)) * epsilon, axes=range(3))
 
         # cross product and transform into mn basis   crossinv_t2c
-        h_m = numpy.sum(e_xyz * n, axis=3)[:, :, :, None] / +k_mag
-        h_n = numpy.sum(e_xyz * m, axis=3)[:, :, :, None] / -k_mag
+        h_m = numpy.sum(d_xyz * n, axis=3)[:, :, :, None] / +k_mag
+        h_n = numpy.sum(d_xyz * m, axis=3)[:, :, :, None] / -k_mag
 
         return numpy.hstack((h_m.ravel(), h_n.ravel()))
 

From 323bcf88ad224f5a77df7f4e3d4c25e46055ffcf Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 15 Jan 2018 22:44:26 -0800
Subject: [PATCH 8/9] Propagate mu correctly

---
 fdfd_tools/bloch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index aabc96c..cbfdf1c 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -389,7 +389,7 @@ def find_k(frequency: float,
 
     def get_f(k0_mag: float, band: int = 0):
         k0 = direction * k0_mag
-        n, _v = eigsolve(band + 1, k0, G_matrix=G_matrix, epsilon=epsilon)
+        n, _v = eigsolve(band + 1, k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
         f = numpy.sqrt(numpy.abs(numpy.real(n[band])))
         return f
 

From 1f9a9949c06ff33a836b05e2b18f18b1495e39fc Mon Sep 17 00:00:00 2001
From: jan <jan@mpxd.net>
Date: Mon, 15 Jan 2018 22:44:59 -0800
Subject: [PATCH 9/9] Clarify memo and cleanup

---
 fdfd_tools/bloch.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index cbfdf1c..002234f 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -505,10 +505,11 @@ def eigsolve(num_modes: int,
         symZtD = _symmetrize(Z.conj().T @ D)
         symZtAD = _symmetrize(Z.conj().T @ AD)
 
-
-        def Qi_func(theta, memo=[None, None]):
-            if memo[0] == theta:
-                return memo[1]
+        Qi_memo = [None, None]
+        def Qi_func(theta):
+            nonlocal Qi_memo
+            if Qi_memo[0] == theta:
+                return Qi_memo[1]
 
             c = numpy.cos(theta)
             s = numpy.sin(theta)
@@ -519,15 +520,15 @@ def eigsolve(num_modes: int,
                 logger.info('taylor Qi')
                 # if c or s small, taylor expand
                 if c < 1e-4 * s and c != 0:
-                    Qi = numpy.linalg.inv(DtD)
-                    Qi = Qi / (s*s) - 2*c/(s*s*s) * (Qi @ (Qi @ symZtD).conj().T)
+                    DtDi = numpy.linalg.inv(DtD)
+                    Qi = DtDi / (s*s) - 2*c/(s*s*s) * (DtDi @ (DtDi @ symZtD).conj().T)
                 elif s < 1e-4 * c and s != 0:
-                    Qi = numpy.linalg.inv(ZtZ)
-                    Qi = Qi / (c*c) - 2*s/(c*c*c) * (Qi @ (Qi @ symZtD).conj().T)
+                    ZtZi = numpy.linalg.inv(ZtZ)
+                    Qi = ZtZi / (c*c) - 2*s/(c*c*c) * (ZtZi @ (ZtZi @ symZtD).conj().T)
                 else:
                     raise Exception('Inexplicable singularity in trace_func')
-            memo[0] = theta
-            memo[1] = Qi
+            Qi_memo[0] = theta
+            Qi_memo[1] = Qi
             return Qi
 
         def trace_func(theta):