working version"

2017-12-18 22:21:24 -08:00 · 2017-12-18 22:21:24 -08:00 · ea2a6a66c5
commit ea2a6a66c5
parent 16f97d7f6b
3 changed files with 730 additions and 9 deletions
--- a/examples/bloch.py
+++ b/examples/bloch.py
@ -0,0 +1,68 @@
 import numpy, scipy, gridlock, fdfd_tools
 from fdfd_tools import bloch
 from numpy.linalg import norm
 import logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 dx = 40
 x_period = 400
 y_period = z_period = 2000
 g = gridlock.Grid([numpy.arange(-x_period/2, x_period/2, dx),
                   numpy.arange(-1000, 1000, dx),
                   numpy.arange(-1000, 1000, dx)],
                  shifts=numpy.array([[0,0,0]]),
                  initial=1.445**2,
                  periodic=True)
 g.draw_cuboid([0,0,0], [200e8, 220, 220], eps=3.47**2)
 #x_period = y_period = z_period = 13000
 #g = gridlock.Grid([numpy.arange(3), ]*3,
 #                  shifts=numpy.array([[0, 0, 0]]),
 #                  initial=2.0**2,
 #                  periodic=True)
 g2 = g.copy()
 g2.shifts = numpy.zeros((6,3))
 g2.grids = [numpy.zeros(g.shape) for _ in range(6)]
 epsilon = [g.grids[0],] * 3
 reciprocal_lattice = numpy.diag(1000/numpy.array([x_period, y_period, z_period])) #cols are vectors
 #print('Finding k at 1550nm')
 #k, f = bloch.find_k(frequency=1/1550,
 #                    tolerance=(1/1550 - 1/1551),
 #                    direction=[1, 0, 0],
 #                    G_matrix=reciprocal_lattice,
 #                    epsilon=epsilon,
 #                    band=0)
 #
 #print("k={}, f={}, 1/f={}, k/f={}".format(k, f, 1/f, norm(reciprocal_lattice @ k) / f ))
 print('Finding f at [0.25, 0, 0]')
 for k0x in [.25]:
    k0 = numpy.array([k0x, 0, 0])
    kmag = norm(reciprocal_lattice @ k0)
    tolerance = 1e-4 * (1/1550)   # df = f * dn_eff / n
    logger.info('tolerance {}'.format(tolerance))
    n, v = bloch.eigsolve(4, k0, G_matrix=reciprocal_lattice, epsilon=epsilon, tolerance=tolerance)
    v2e = bloch.hmn_2_exyz(k0, G_matrix=reciprocal_lattice, epsilon=epsilon)
    v2h = bloch.hmn_2_hxyz(k0, G_matrix=reciprocal_lattice, epsilon=epsilon)
    ki = bloch.generate_kmn(k0, reciprocal_lattice, g.shape)
    z = 0
    e = v2e(v[0])
    for i in range(3):
        g2.grids[i] += numpy.real(e[i])
        g2.grids[i+3] += numpy.imag(e[i])
    f = numpy.sqrt(numpy.real(numpy.abs(n))) # TODO
    print('k0x = {:3g}\n eigval = {}\n f = {}\n'.format(k0x, n, f))
    n_eff = norm(reciprocal_lattice @ k0) / f
    print('kmag/f = n_eff =  {} \n wl = {}\n'.format(n_eff, 1/f ))
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@ -438,23 +438,22 @@ def eigsolve(num_modes: int,
        v /= norm(v)
        logger.info('eigness {}: {}'.format(i, norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )))
    ev2 = eigvecs.copy()
    for i in range(len(eigvals)):
        logger.info('Refining eigenvector {}'.format(i))
-        eigvals[i], ev2[:, i] = rayleigh_quotient_iteration(scipy_op,
+        eigvals[i], eigvecs[:, i] = rayleigh_quotient_iteration(scipy_op,
                                                                guess_vector=eigvecs[:, i],
                                                                iterations=40,
                                                                tolerance=tolerance * numpy.real(numpy.sqrt(eigvals[i])) * 2,
                                                                solver = lambda A, b: spalg.bicgstab(A, b, maxiter=200)[0])
    eigvecs = ev2
    order = numpy.argsort(numpy.abs(eigvals))
    for i in range(len(eigvals)):
        v = eigvecs[:, i]
        n = eigvals[i]
        v /= norm(v)
-        logger.info('eigness {}: {}'.format(i, norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )))
+        eigness = norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )
        logger.info('eigness {}: {}'.format(i, eigness))
    order = numpy.argsort(numpy.abs(eigvals))
    return eigvals[order], eigvecs.T[order]
@ -499,3 +498,245 @@ def find_k(frequency: float,
    return res.x * direction, res.fun + frequency
 #def eigsolve2(num_modes: int,
 #             k0: numpy.ndarray,
 #             G_matrix: numpy.ndarray,
 #             epsilon: field_t,
 #             mu: field_t = None,
 #             tolerance = 1e-8,
 #             ) -> Tuple[numpy.ndarray, numpy.ndarray]:
 #    """
 #    Find the first (lowest-frequency) num_modes eigenmodes with Bloch wavevector
 #     k0 of the specified structure.
 #
 #    :param k0: Bloch wavevector, [k0x, k0y, k0z].
 #    :param G_matrix: 3x3 matrix, with reciprocal lattice vectors as columns.
 #    :param epsilon: Dielectric constant distribution for the simulation.
 #        All fields are sampled at cell centers (i.e., NOT Yee-gridded)
 #    :param mu: Magnetic permability distribution for the simulation.
 #        Default None (1 everywhere).
 #    :return: (eigenvalues, eigenvectors) where eigenvalues[i] corresponds to the
 #        vector eigenvectors[i, :]
 #    """
 #    h_size = 2 * epsilon[0].size
 #
 #
 #    '''
 #    Generate the operators
 #    '''
 #    mop = maxwell_operator(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
 #    imop = inverse_maxwell_operator_approx(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
 #
 #    scipy_op = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=mop)
 #    scipy_iop = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=imop)
 #
 #    y_shape = (h_size, num_modes)
 #
 #
 #    prev_E = 0
 #    d_scale = 1
 #    prev_traceGtKG = 0
 #    prev_theta = 0.5
 #    D = numpy.zeros(shape=y_shape, dtype=complex)
 #
 #    y0 = None
 #    if y0 is None:
 #        z = numpy.random.rand(*y_shape, dtype=complex)
 #    else:
 #        z = y0
 #
 #    while True:
 #        z2 = z.conj().T @ z
 #        z_norm = numpy.sqrt(numpy.real(numpy.trace(z2))) / num_modes
 #        z /= z_norm
 #        z2 /= z_norm * z_norm
 #        try:
 #            U = numpy.linalg.inv(z2)
 #        except numpy.linalg.LinAlgError:
 #            z = numpy.random.rand(*y_shape, dtype=complex)
 #            continue
 #
 #        trace_U = numpy.real(numpy.trace(U))
 #        if trace_U > 1e8 * num_modes:
 #            z = z @ scipy.linalg.sqrtm(U).conj().T
 #            prev_trace GtX = 0
 #            continue
 #        break
 #
 #
 #    def rayleigh_quotient(Z: numpy.ndarray, approx_grad: bool = True):
 #        """
 #        Absolute value of the block Rayleigh quotient, and the associated gradient.
 #
 #        See Johnson and Joannopoulos, Opt. Expr. 8, 3 (2001) for details (full
 #         citation in module docstring).
 #
 #        ===
 #
 #        Notes on my understanding of the procedure:
 #
 #        Minimize f(Y) = |trace((Y.H @ A @ Y)|, making use of Y = Z @ inv(Z.H @ Z)^(1/2)
 #         (a polar orthogonalization of Y). This gives f(Z) = |trace(Z.H @ A @ Z @ U)|,
 #         where U = inv(Z.H @ Z). We minimize the absolute value to find the eigenvalues
 #         with smallest magnitude.
 #
 #        The gradient is P @ (A @ Z @ U), where P = (1 - Z @ U @ Z.H) is a projection
 #         onto the space orthonormal to Z. If approx_grad is True, the approximate
 #         inverse of the maxwell operator is used to precondition the gradient.
 #        """
 #        z = Z.reshape(y_shape)
 #        U = numpy.linalg.inv(z.conj().T @ z)
 #        zU = z @ U
 #        AzU = scipy_op @ zU
 #        zTAzU = z.conj().T @ AzU
 #        f = numpy.real(numpy.trace(zTAzU))
 #        if approx_grad:
 #            df_dy = scipy_iop @ (AzU - zU @ zTAzU)
 #        else:
 #            df_dy = (AzU - zU @ zTAzU)
 #        logging.info('f={}'.format(f))
 #        return numpy.abs(f), numpy.sign(f) * numpy.real(df_dy).ravel()
 #
 #    max_iters = 10000 #TODO
 #    for iter in range(max_iters):
 #        f, G = rayleigh_quotient(z, False)
 #
 #        if iter > 0 and abs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7):
 #            break
 #
 #        KG = scipy_iop @ G
 #        traceGtKG = numpy.real(numpy.trace(G.conj().T @ KG)) + g_lag * g_lag
 #        gamma_numerator = traceGtKG
 #
 #        reset_iters = 1000 #TODO
 #        if prev_trace_GtKG == 0 or iter % reset_iters == 0:
 #            gamma = 0
 #        else:
 #            gamma = gamma_numerator / prev_trace_GtKG
 #
 #        D = gamma * d_scale * D + KG
 #        d_lag = gamma * d_scale * d_lag + g_lag
 #
 #
 #        d_scale = numpy.sqrt(numpy.real(numpy.sum(D.conj() * D))) / num_modes
 #        D /= d_scale
 #
 #        AD = A @ D
 #        DtD = D.conj().T @ D
 #        DtAD = D.conj().T @ AD
 #
 #        YtD = z.conj().T @ D
 #        YtAD = z.conj().T @ AD
 #        symYtD = (YtD + YtD.conj().T) * 0.5
 #        symYtAD = (YtD + YtAD.conj().T) * 0.5
 #
 #        U_sYtD = U @ symYtD #.H -- shouldn't matter?
 #
 #        dE = 2.0 * (real(trace(U.H @ symYtAD)) - real(trace(YtAYU.H @ U_sYtD)))
 #
 #        S2 = DtD - 4 * symYtD @ U_sYtD
 #        d2E = 2.0 * (real(trace(U.conj().T @ DtAD)) -
 #                     real(trace(YtAYU.conj().T @ U @ S2)) -
 #               4.0 * real(trace(U.conj().T @ symYtAD @ U_sYtD)))
 #
 #        d_lag = lag = trace_YtLY = trace_DtLD = trace_YtLD = 0
 #
 #        # Newton-Raphson to find a root of the first derivative:
 #        theta = -dE/d2E
 #
 #        if d2E < 0 or abs(theta) >= K_PI:
 #            theta = -abs(prev_theta) * numpy.sign(dE)
 #
 #        # YtAYU * YtBY = YtAY for use in linmin.
 #        YtAY = YtAYU @ YtBY.conj().T
 #
 #        theta, new_E, new_dE = linmin(theta, E, dE, 0.1, min(tolerance, 1e-6), 1e-14, 0, -numpy.sign(dE) * K_PI, trace_func, tfd)
 #
 #        improvement = numpy.abs(E - new_E) * 2 / numpy.abs(E + new_E)
 #        logging.info('linmin improvement {}'.format(improvement))
 #        z *= numpy.cos(theta)
 #        z += D * numpy.sin(theta)
 #
 #
 #        prev_traceGtKG = trace_GtKG
 #        prev_theta = theta
 #        prev_E = E
 #
 #
 #    def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func):
 #        if df0 > 0:
 #            x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq))
 #            return -x0, f0, -df0
 #        elif df0 == 0:
 #            return 0, f0, df0
 #        else:
 #            x = x_guess
 #            fx = f0
 #            dfx = df0
 #
 #            dsrch(xxx,,)
 #            for i in range(int(1e6)):
 #                if task != 'F':
 #                    logging.info('search converged in {} iterations'.format(i))
 #                    break
 #                fx = f(x, dfx)
 #                dsrch(...)
 #
 #            return x, fx, dfx
 #
 #
 #
 #    '''
 #    Use the conjugate gradient method and the approximate gradient calculation to
 #     quickly find approximate eigenvectors.
 #    '''
 #    #result = scipy.optimize.minimize(rayleigh_quotient,
 #    #                                 numpy.random.rand(*y_shape),
 #    #                                 jac=True,
 #    #                                 method='CG',
 #    #                                 tol=1e-5,
 #    #                                 options={'maxiter': 30, 'disp':True})
 #
 #    #result = scipy.optimize.minimize(lambda y: rayleigh_quotient(y, False),
 #    #                                 result.x,
 #    #                                 jac=True,
 #    #                                 method='CG',
 #    #                                 tol=1e-13,
 #    #                                 options={'maxiter': 100, 'disp':True})
 #    #
 #    #z = result.x.reshape(y_shape)
 #
 #    '''
 #    Recover eigenvectors from Z
 #    '''
 #    U = numpy.linalg.inv(z.conj().T @ z)
 #    y = z @ scipy.linalg.sqrtm(U)
 #    w = y.conj().T @ (scipy_op @ y)
 #
 #    eigvals, w_eigvecs = numpy.linalg.eig(w)
 #    eigvecs = y @ w_eigvecs
 #
 #    for i in range(len(eigvals)):
 #        v = eigvecs[:, i]
 #        n = eigvals[i]
 #        v /= norm(v)
 #        logger.info('eigness {}: {}'.format(i, norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )))
 #
 #    for i in range(len(eigvals)):
 #        logger.info('Refining eigenvector {}'.format(i))
 #        eigvals[i], eigvecs[:, i] = rayleigh_quotient_iteration(scipy_op,
 #                                                                guess_vector=eigvecs[:, i],
 #                                                                iterations=40,
 #                                                                tolerance=tolerance * numpy.real(numpy.sqrt(eigvals[i])) * 2,
 #                                                                solver = lambda A, b: spalg.bicgstab(A, b, maxiter=200)[0])
 #
 #    for i in range(len(eigvals)):
 #        v = eigvecs[:, i]
 #        n = eigvals[i]
 #        v /= norm(v)
 #        eigness = norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )
 #        logger.info('eigness {}: {}'.format(i, eigness))
 #
 #    order = numpy.argsort(numpy.abs(eigvals))
 #    return eigvals[order], eigvecs.T[order]
--- a/fdfd_tools/eigensolver.c
+++ b/fdfd_tools/eigensolver.c
@ -0,0 +1,412 @@
 /* Copyright (C) 1999-2014 Massachusetts Institute of Technology.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include "config.h"
 #include <mpiglue.h>
 #include <mpi_utils.h>
 #include <check.h>
 #include <scalar.h>
 #include <matrices.h>
 #include <blasglue.h>
 #include "eigensolver.h"
 #include "linmin.h"
 extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals,
                                          evectoperator A, void *Adata,
                                          evectmatrix Work1, evectmatrix Work2,
                                          sqmatrix U, sqmatrix Usqrt,
                                          sqmatrix Uwork);
 #define STRINGIZEx(x) #x /* a hack so that we can stringize macro values */
 #define STRINGIZE(x) STRINGIZEx(x)
 #define K_PI 3.141592653589793238462643383279502884197
 #define MIN2(a,b) ((a) < (b) ? (a) : (b))
 #define MAX2(a,b) ((a) > (b) ? (a) : (b))
 #if defined(SCALAR_LONG_DOUBLE_PREC)
 #  define fabs fabsl
 #  define cos cosl
 #  define sin sinl
 #  define sqrt sqrtl
 #  define atan atanl
 #  define atan2 atan2l
 #endif
 /* Evalutate op, and set t to the elapsed time (in seconds). */
 #define TIME_OP(t, op) { \
     mpiglue_clock_t xxx_time_op_start_time = MPIGLUE_CLOCK; \
     { \
 	  op; \
     } \
     (t) = MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, xxx_time_op_start_time); \
 }
 /**************************************************************************/
 #define EIGENSOLVER_MAX_ITERATIONS 100000
 #define FEEDBACK_TIME 4.0 /* elapsed time before we print progress feedback */
 /* Number of iterations after which to reset conjugate gradient
   direction to steepest descent.  (Picked after some experimentation.
   Is there a better basis?  Should this change with the problem
   size?) */
 #define CG_RESET_ITERS 70
 /* Threshold for trace(1/YtBY) = trace(U) before we reorthogonalize: */
 #define EIGS_TRACE_U_THRESHOLD 1e8
 /**************************************************************************/
 /* estimated times/iteration for different iteration schemes, based
   on the measure times for various operations and the operation counts: */
 #define EXACT_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ, t_linmin) \
     ((t_AZ)*2 + (t_KZ) + (t_ZtW)*4 + (t_ZS)*2 + (t_ZtZ)*2 + (t_linmin))
 #define APPROX_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ) \
     ((t_AZ)*2 + (t_KZ) + (t_ZtW)*2 + (t_ZS)*2 + (t_ZtZ)*2)
 /* Guess for the convergence slowdown factor due to the approximate
   line minimization.  It is probably best to be conservative, as the
   exact line minimization is more reliable and we only want to
   abandon it if there is a big speed gain. */
 #define APPROX_LINMIN_SLOWDOWN_GUESS 2.0
 /* We also don't want to use the approximate line minimization if
   the exact line minimization makes a big difference in the value
   of the trace that's achieved (i.e. if one step of Newton's method
   on the trace derivative does not do a good job).  The following
   is the maximum improvement by the exact line minimization (over
   one step of Newton) at which we'll allow the use of approximate line
   minimization. */
 #define APPROX_LINMIN_IMPROVEMENT_THRESHOLD 0.05
 /**************************************************************************/
 typedef struct {
     sqmatrix YtAY, DtAD, symYtAD, YtBY, DtBD, symYtBD, S1, S2, S3;
     real lag, d_lag, trace_YtLY, trace_DtLD, trace_YtLD;
 } trace_func_data;
 static linmin_real trace_func(linmin_real theta, linmin_real *trace_deriv, void *data)
 {
     linmin_real trace;
     trace_func_data *d = (trace_func_data *) data;
     linmin_real c = cos(theta), s = sin(theta);
 	 YDNi = c*c * YtY + s*s * DtD + 2*s*c * symYtD
     YDNi.inv()
     if not YDNi.inv():
 	       /* if c or s is small, we sometimes run into numerical
 		  difficulties, and it is better to use the Taylor expansion */
 	       if c < 1e-4 * s and c != 0: 
 		    YDNi = DtD.inv()
 		    S3 = (YDNi @ symYtD.H) @ YDNi.H
 		    YDNi = 1/(s*s) * YDNi - 2*c/(s*s*s) * S3
 	       elif s < 1e-4 * c and s != 0:
 		    YDNi = YtY.inv()
 		    S3 = (YDNi @ symYtD.H) @ YDNi.H
 		    YDNi = 1/(c*c) * YDNi - 2*s/(c*c*c) * S3
 	       else:
 		    CHECK(0, "inexplicable singularity in linmin trace_func")
     YADN = c*c * YtAY + s*s * DtAD + 2*s*c * smYtAD
 	 trace = real(trace(YADN.H @ YDNi)) + (c*c * trace_YtLY + s*s * trace_DtLD + 2*s*c * trace_YtLD) * (c * lag + s * d_lag)
     if (trace_deriv) {
 	  c2 = cos(2*theta)
      s2 = sin(2*theta);
 	  S3 = -0.5 * s2 * (YtAY - DtAD) + c2 * symYtAD
      *trace_deriv = real(trace(YDNi.H @ S3))
 	  S2 = (YDNi @ YADN.H) @ YDNi.H
 	  S3 = -0.5 * s2 * (YtY - DtD) + c2 * symYtD
 	  *trace_deriv -= real(trace(S2.H @ S3))
 	  *trace_deriv *= 2
 	  *trace_deriv += (-s2 * trace_YtLY + s2 * trace_DtLD
 		  + 2*c2 * trace_YtLD) * (c * lag + s * d_lag);
 	  *trace_deriv += (c*c * trace_YtLY + s*s * trace_DtLD
 		  + 2*s*c * trace_YtLD) * (-s * lag + c * d_lag);
     }
     return trace;
 }
 /**************************************************************************/
 #define EIG_HISTORY_SIZE 5
 /* find generalized eigenvectors Y of (A,B) by minimizing Rayleigh quotient
        tr [ Yt A Y / (Yt B Y) ] + lag * tr [ Yt L Y ]
   where lag is a Lagrange multiplier and L is a Hermitian operator
   implementing some constraint tr [ Yt L Y ] = 0 on the eigenvectors
   (if L is not NULL).
   Constraints that commute with A and B (and L) are specified via the
   "constraint" argument, which gives the projection operator for
   the constraint(s).
 */
 void eigensolver_lagrange(evectmatrix Y, real *eigenvals,
 			  evectoperator A, void *Adata,
 			  evectoperator B, void *Bdata,
 			  evectpreconditioner K, void *Kdata,
 			  evectconstraint constraint, void *constraint_data,
 			  evectoperator L, void *Ldata, real *lag,
 			  evectmatrix Work[], int nWork,
 			  real tolerance, int *num_iterations,
 			  int flags)
 {
     real g_lag = 0, d_lag = 0, prev_g_lag = 0;
     short usingConjugateGradient = 0, use_polak_ribiere = 0,
 	   use_linmin = 1;
     real E, prev_E = 0.0;
     real d_scale = 1.0;
     real traceGtX, prev_traceGtX = 0.0;
     real theta, prev_theta = 0.5;
     int i, iteration = 0, num_emergency_restarts = 0;
     mpiglue_clock_t prev_feedback_time;
     real linmin_improvement = 0;
     G = Work[0];
     X = Work[1];
     BY = Y;
     D = X;
     BD = D; /* storage for B*D (re-use B*Y storage) */
     prev_G = G;
 restartY:
     eigenvals *= 0.0
 	 convergence_history = [10000.0] * n
 	 constraint(Y, constraint_data)
     do {
 	  real y_norm, gamma_numerator = 0;
      YtBY = Y.H @ Y
 	  y_norm = sqrt(real(trace(YtBY)) / Y.p);
 	  Y /= y_norm
 	  YtBY /= y_norm*y_norm
 	  U = YtBY
      if (!sqmatrix_invert(U, 1, S2)) /* non-independent Y columns */
 	       /* emergency restart with random Y */
          ...
 	  /* If trace(1/YtBY) gets big, it means that the columns
 	     of Y are becoming nearly parallel.  This sometimes happens,
 	     especially in the targeted eigensolver, because the
 	     preconditioner pushes all the columns towards the ground
 	     state.  If it gets too big, it seems to be a good idea
 	     to re-orthogonalize, resetting conjugate-gradient, as
 	     otherwise we start to encounter numerical problems. */
 	  if (flags & EIGS_REORTHOGONALIZE) {
 	       traceU = real(trace(U))
 	       if (traceU > EIGS_TRACE_U_THRESHOLD * U.p) {
 		    Y = Y @ sqrtm(U).H /* orthonormalize Y */
 		    prev_traceGtX = 0.0;
            YtBY = Y.H @ Y
 		    y_norm = sqrt(real(trace(YtBY)) / Y.p)
 		    Y /= y_norm
 		    YtBY /= y_norm * y_norm
 		    U = YtBY
 	  /* G = AYU; note that U is Hermitian: */
      G = A @ Y @ U
 	  YtAYU = Y.H @ G
 	  E = real(trace(YtAYU))
 	  convergence_history[iteration % EIG_HISTORY_SIZE] =
 	       200.0 * fabs(E - prev_E) / (fabs(E) + fabs(prev_E));
 	  if (iteration > 0 &&
              fabs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7))
               break; /* convergence!  hooray! */
 	  /* Compute gradient of functional: G = (1 - BY U Yt) A Y U */
 	  G += -Y @ (U @ YtAYU)
 	  /* set X = precondition(G): */
 	  X = K @ G
      //TIME_OP(time_KZ, K(G, X, Kdata, Y, NULL, YtBY));
 	  /* We have to apply the constraint here, in case it doesn't
             commute with the preconditioner. */
      constraint(X, constraint_data);
 	  d_scale = 1.0;
 	  /* Minimize the trace along Y + lambda*D: */
 	  if (!use_linmin) {
 	       real dE, E2, d2E, t, d_norm;
 	       /* Here, we do an approximate line minimization along D
 		  by evaluating dE (the derivative) at the current point,
 		  and the trace E2 at a second point, and then approximating
 		  the second derivative d2E by finite differences.  Then,
 		  we use one step of Newton's method on the derivative.
 	          This has the advantage of requiring two fewer O(np^2)
 	          matrix multiplications compared to the exact linmin. */
 	       d_norm = sqrt(real(trace(D.H @ D)) / Y.p);
 	       /* dE = 2 * tr Gt D.  (Use prev_G instead of G so that
 		  it works even when we are using Polak-Ribiere.) */
 	       dE = 2.0 * SCALAR_RE(evectmatrix_traceXtY(prev_G, D)) / d_norm;
 	       /* shift Y by prev_theta along D, in the downhill direction: */
 	       t = dE < 0 ? -fabs(prev_theta) : fabs(prev_theta);
 	       Y += t/d_norm * D
           U = inv(Y.H @ Y)
 	       E2 = real(trace((Y.H @ A @ Y).H @ U))
 	       /* Get finite-difference approximation for the 2nd derivative
 		  of the trace.  Equivalently, fit to a quadratic of the
 		  form:  E(theta) = E + dE theta + 1/2 d2E theta^2 */
 	       d2E = (E2 - E - dE * t) / (0.5 * t * t);
 	       theta = -dE/d2E;
 	       /* If the 2nd derivative is negative, or a big shift
 		  in the trace is predicted (compared to the previous
 		  iteration), then this approximate line minimization is
 		  probably not very good; switch back to the exact
 		  line minimization.  Hopefully, we won't have to
 		  abort like this very often, as it wastes operations. */
 	       if (d2E < 0 || -0.5*dE*theta > 20.0 * fabs(E-prev_E)) {
 		    if (flags & EIGS_FORCE_APPROX_LINMIN) {
 		    } else {
 			 use_linmin = 1;
 			 evectmatrix_aXpbY(1.0, Y, -t / d_norm, D);
 			 prev_theta = atan(prev_theta); /* convert to angle */
 			 /* don't do this again: */
 			 flags |= EIGS_FORCE_EXACT_LINMIN;
 		    }
 	       }
 	       else {
 		    /* Shift Y by theta, hopefully minimizing the trace: */
 		    evectmatrix_aXpbY(1.0, Y, (theta - t) / d_norm, D);
 	       }
 	  }
 	  if (use_linmin) {
 	       d_scale = sqrt(real(trace(D.H @ D)) / Y.p);
 	       D /= d_scale
 	       AD = A @ D
           DtD = D.H @ D
 	       DtAD = D.H @ AD
           YtD = Y.H @ D
           YtAD = Y.H @ AD
 	       symYtD = (YtD + YtD.H) / 2
 	       symYtAD = (YtAD + YtAD.H) / 2
 	       U_sYtD = U @ symYtD.H
 	       dE = 2.0 * (real(trace(U.H @ symYtAD)) - real(trace(YtAYU.H @ U_sYtD)))
 	       S2 = DtD - 4 * symYtD @ U_sYtD
 	       d2E = 2.0 * (real(trace(U.H @ DtAD)) -
                        real(trace(YtAYU.H @ U @ S2)) -
                  4.0 * real(trace(U.H @ symYtAD @ U_sYtD)))
 		   d_lag = lag = trace_YtLY = trace_DtLD = trace_YtLD = 0
 	       /* this is just Newton-Raphson to find a root of
 		  the first derivative: */
 	       theta = -dE/d2E;
 	       if d2E < 0 or abs(theta) >= K_PI:
 		    theta = -abs(prev_theta) * numpy.sign(dE)
 	       /* Set S1 to YtAYU * YtBY = YtAY for use in linmin.
 		  (tfd.YtAY == S1). */
 	       YtAY = YtAYU @ YtBY.H
 		   theta = linmin(&new_E, &new_dE, theta, E, dE,
 					   0.1, min(tolerance, 1e-6), 1e-14,
 					   0, -numpy.sign(dE) * K_PI,
 					   trace_func, &tfd,
 					   flags & EIGS_VERBOSE)
 		    linmin_improvement = abs(E - new_E) * 2.0/abs(E + new_E);
 	       /* Shift Y to new location minimizing the trace along D: */
 	       Y *= cos(theta)
           Y += D * sin(theta)
 	  }
 	  /* In exact arithmetic, we don't need to do this, but in practice
 	     it is probably a good idea to keep errors from adding up and
 	     eventually violating the constraints. */
      constraint(Y, constraint_data);
 	  prev_traceGtX = traceGtX;
      prev_theta = theta;
      prev_E = E;
 	  /* Finally, we use the times for the various operations to
 	     help us pick an algorithm for the next iteration: */
 	  {
 	       real t_exact, t_approx;
 	       t_exact = EXACT_LINMIN_TIME(time_AZ, time_KZ, time_ZtW,
 					   time_ZS, time_ZtZ, time_linmin);
 	       t_approx = APPROX_LINMIN_TIME(time_AZ, time_KZ, time_ZtW,
 					     time_ZS, time_ZtZ);
 	       /* Sum the times over the processors so that all the
 		  processors compare the same, average times. */
 	       mpi_allreduce_1(&t_exact,
 			       real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
 	       mpi_allreduce_1(&t_approx,
 			       real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
 	       if (!(flags & EIGS_FORCE_EXACT_LINMIN) &&
 		   linmin_improvement > 0 &&
 		   linmin_improvement <= APPROX_LINMIN_IMPROVEMENT_THRESHOLD &&
 		   t_exact > t_approx * APPROX_LINMIN_SLOWDOWN_GUESS) {
 		    use_linmin = 0;
 	       }
 	       else if (!(flags & EIGS_FORCE_APPROX_LINMIN)) {
 		    use_linmin = 1;
 		    prev_theta = atan(prev_theta); /* convert to angle */
 	       }
 	  }
     } while (++iteration < EIGENSOLVER_MAX_ITERATIONS);
     evectmatrix_XtX(U, Y, S2);
     CHECK(sqmatrix_invert(U, 1, S2), "singular YtBY at end");
     eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata,
 				   X, G, U, S1, S2);
 }