diff --git a/examples/bloch.py b/examples/bloch.py
new file mode 100644
index 0000000..6f34a3b
--- /dev/null
+++ b/examples/bloch.py
@@ -0,0 +1,68 @@
+import numpy, scipy, gridlock, fdfd_tools
+from fdfd_tools import bloch
+from numpy.linalg import norm
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+dx = 40
+x_period = 400
+y_period = z_period = 2000
+g = gridlock.Grid([numpy.arange(-x_period/2, x_period/2, dx),
+                   numpy.arange(-1000, 1000, dx),
+                   numpy.arange(-1000, 1000, dx)],
+                  shifts=numpy.array([[0,0,0]]),
+                  initial=1.445**2,
+                  periodic=True)
+
+g.draw_cuboid([0,0,0], [200e8, 220, 220], eps=3.47**2)   # x-extent >> x_period: waveguide is continuous along x
+
+#x_period = y_period = z_period = 13000
+#g = gridlock.Grid([numpy.arange(3), ]*3,
+#                  shifts=numpy.array([[0, 0, 0]]),
+#                  initial=2.0**2,
+#                  periodic=True)
+
+g2 = g.copy()
+g2.shifts = numpy.zeros((6,3))
+g2.grids = [numpy.zeros(g.shape) for _ in range(6)]
+
+epsilon = [g.grids[0],] * 3
+reciprocal_lattice = numpy.diag(1000/numpy.array([x_period, y_period, z_period]))  # columns are reciprocal lattice vectors
+
+#print('Finding k at 1550nm')
+#k, f = bloch.find_k(frequency=1/1550,
+#                    tolerance=(1/1550 - 1/1551),
+#                    direction=[1, 0, 0],
+#                    G_matrix=reciprocal_lattice,
+#                    epsilon=epsilon,
+#                    band=0)
+#
+#print("k={}, f={}, 1/f={}, k/f={}".format(k, f, 1/f, norm(reciprocal_lattice @ k) / f ))
+
+print('Finding f at [0.25, 0, 0]')
+for k0x in [.25]:
+    k0 = numpy.array([k0x, 0, 0])
+
+    kmag = norm(reciprocal_lattice @ k0)
+    tolerance = 1e-4 * (1/1550)     # df = f * dn_eff / n
+    logger.info('tolerance {}'.format(tolerance))
+
+    n, v = bloch.eigsolve(4, k0, G_matrix=reciprocal_lattice, epsilon=epsilon, tolerance=tolerance)
+    v2e = bloch.hmn_2_exyz(k0, G_matrix=reciprocal_lattice, epsilon=epsilon)
+    v2h = bloch.hmn_2_hxyz(k0, G_matrix=reciprocal_lattice, epsilon=epsilon)
+    ki = bloch.generate_kmn(k0, reciprocal_lattice, g.shape)
+
+    z = 0
+    e = v2e(v[0])
+    for i in range(3):
+        g2.grids[i] += numpy.real(e[i])
+        g2.grids[i+3] += numpy.imag(e[i])
+
+    f = numpy.sqrt(numpy.abs(n))    # TODO
+    print('k0x = {:.3g}\n eigval = {}\n f = {}\n'.format(k0x, n, f))
+    n_eff = norm(reciprocal_lattice @ k0) / f
+    print('kmag/f = n_eff = {}\n wl = {}\n'.format(n_eff, 1/f ))
+
diff --git a/fdfd_tools/bloch.py b/fdfd_tools/bloch.py
index ad7c555..7e41e14 100644
--- a/fdfd_tools/bloch.py
+++ b/fdfd_tools/bloch.py
@@ -438,23 +438,22 @@ def eigsolve(num_modes: int,
         v /= norm(v)
         logger.info('eigness {}: {}'.format(i, norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )))
 
-    ev2 = eigvecs.copy()
     for i in range(len(eigvals)):
         logger.info('Refining eigenvector {}'.format(i))
-        eigvals[i], ev2[:, i] = rayleigh_quotient_iteration(scipy_op,
-                                                            guess_vector=eigvecs[:, i],
-                                                            iterations=40,
-                                                            tolerance=tolerance * numpy.real(numpy.sqrt(eigvals[i])) * 2,
-                                                            solver = lambda A, b: spalg.bicgstab(A, b, maxiter=200)[0])
-    eigvecs = ev2
-    order = numpy.argsort(numpy.abs(eigvals))
+        eigvals[i], eigvecs[:, i] = rayleigh_quotient_iteration(scipy_op,
+                                                                guess_vector=eigvecs[:, i],
+                                                                iterations=40,
+                                                                tolerance=tolerance * numpy.real(numpy.sqrt(eigvals[i])) * 2,
+                                                                solver = lambda A, b: spalg.bicgstab(A, b, maxiter=200)[0])
 
     for i in range(len(eigvals)):
         v = eigvecs[:, i]
         n = eigvals[i]
         v /= norm(v)
-        logger.info('eigness {}: {}'.format(i, norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )))
+        eigness = norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )
+        logger.info('eigness {}: {}'.format(i, eigness))
 
+    order = numpy.argsort(numpy.abs(eigvals))
     return eigvals[order], eigvecs.T[order]
 
@@ -499,3 +498,245 @@ def find_k(frequency: float,
     return res.x * direction, res.fun + frequency
+
+#def eigsolve2(num_modes: int,
+#              k0: numpy.ndarray,
+#              G_matrix: numpy.ndarray,
+#              epsilon: field_t,
+#              mu: field_t = None,
+#              tolerance = 1e-8,
+#              ) -> Tuple[numpy.ndarray, numpy.ndarray]:
+#    """
+#    Find the first (lowest-frequency) num_modes eigenmodes with Bloch wavevector
+#    k0 of the specified structure.
+#
+#    :param k0: Bloch wavevector, [k0x, k0y, k0z].
+#    :param G_matrix: 3x3 matrix, with reciprocal lattice vectors as columns.
+#    :param epsilon: Dielectric constant distribution for the simulation.
+#        All fields are sampled at cell centers (i.e., NOT Yee-gridded)
+#    :param mu: Magnetic permeability distribution for the simulation.
+#        Default None (1 everywhere).
+#    :return: (eigenvalues, eigenvectors) where eigenvalues[i] corresponds to the
+#        vector eigenvectors[i, :]
+#    """
+#    h_size = 2 * epsilon[0].size
+#
+#
+#    '''
+#    Generate the operators
+#    '''
+#    mop = maxwell_operator(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
+#    imop = inverse_maxwell_operator_approx(k0=k0, G_matrix=G_matrix, epsilon=epsilon, mu=mu)
+#
+#    scipy_op = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=mop)
+#    scipy_iop = spalg.LinearOperator(dtype=complex, shape=(h_size, h_size), matvec=imop)
+#
+#    y_shape = (h_size, num_modes)
+#
+#
+#    prev_E = 0
+#    d_scale = 1
+#    g_lag = d_lag = 0
+#    prev_traceGtKG = 0
+#    prev_theta = 0.5
+#    D = numpy.zeros(shape=y_shape, dtype=complex)
+#
+#    y0 = None
+#    if y0 is None:
+#        z = numpy.random.rand(*y_shape) + 1j * numpy.random.rand(*y_shape)
+#    else:
+#        z = y0
+#
+#    while True:
+#        z2 = z.conj().T @ z
+#        z_norm = numpy.sqrt(numpy.real(numpy.trace(z2)) / num_modes)
+#        z /= z_norm
+#        z2 /= z_norm * z_norm
+#        try:
+#            U = numpy.linalg.inv(z2)
+#        except numpy.linalg.LinAlgError:
+#            z = numpy.random.rand(*y_shape) + 1j * numpy.random.rand(*y_shape)
+#            continue
+#
+#        trace_U = numpy.real(numpy.trace(U))
+#        if trace_U > 1e8 * num_modes:
+#            z = z @ scipy.linalg.sqrtm(U).conj().T
+#            prev_traceGtKG = 0
+#            continue
+#        break
+#
+#
+#    def rayleigh_quotient(Z: numpy.ndarray, approx_grad: bool = True):
+#        """
+#        Absolute value of the block Rayleigh quotient, and the associated gradient.
+#
+#        See Johnson and Joannopoulos, Opt. Expr. 8, 3 (2001) for details (full
+#        citation in module docstring).
+#
+#        ===
+#
+#        Notes on my understanding of the procedure:
+#
+#        Minimize f(Y) = |trace(Y.H @ A @ Y)|, making use of Y = Z @ inv(Z.H @ Z)^(1/2)
+#        (a polar orthogonalization of Y). This gives f(Z) = |trace(Z.H @ A @ Z @ U)|,
+#        where U = inv(Z.H @ Z). We minimize the absolute value to find the eigenvalues
+#        with smallest magnitude.
+#
+#        The gradient is P @ (A @ Z @ U), where P = (1 - Z @ U @ Z.H) is a projection
+#        onto the space orthogonal to Z. If approx_grad is True, the approximate
+#        inverse of the maxwell operator is used to precondition the gradient.
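+#
+#        As an illustrative aside (not used by the solver): with orthonormal
+#        Z, so that U = I, the projector P = 1 - Z @ Z.H satisfies P @ Z = 0
+#        and P @ P = P, which is easy to check numerically:
+#
+#            Z, _ = numpy.linalg.qr(numpy.random.rand(100, 4))
+#            P = numpy.eye(100) - Z @ Z.conj().T
+#            assert numpy.allclose(P @ Z, 0) and numpy.allclose(P @ P, P)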
+# """ +# z = Z.reshape(y_shape) +# U = numpy.linalg.inv(z.conj().T @ z) +# zU = z @ U +# AzU = scipy_op @ zU +# zTAzU = z.conj().T @ AzU +# f = numpy.real(numpy.trace(zTAzU)) +# if approx_grad: +# df_dy = scipy_iop @ (AzU - zU @ zTAzU) +# else: +# df_dy = (AzU - zU @ zTAzU) +# logging.info('f={}'.format(f)) +# return numpy.abs(f), numpy.sign(f) * numpy.real(df_dy).ravel() +# +# max_iters = 10000 #TODO +# for iter in range(max_iters): +# f, G = rayleigh_quotient(z, False) +# +# if iter > 0 and abs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7): +# break +# +# KG = scipy_iop @ G +# traceGtKG = numpy.real(numpy.trace(G.conj().T @ KG)) + g_lag * g_lag +# gamma_numerator = traceGtKG +# +# reset_iters = 1000 #TODO +# if prev_trace_GtKG == 0 or iter % reset_iters == 0: +# gamma = 0 +# else: +# gamma = gamma_numerator / prev_trace_GtKG +# +# D = gamma * d_scale * D + KG +# d_lag = gamma * d_scale * d_lag + g_lag +# +# +# d_scale = numpy.sqrt(numpy.real(numpy.sum(D.conj() * D))) / num_modes +# D /= d_scale +# +# AD = A @ D +# DtD = D.conj().T @ D +# DtAD = D.conj().T @ AD +# +# YtD = z.conj().T @ D +# YtAD = z.conj().T @ AD +# symYtD = (YtD + YtD.conj().T) * 0.5 +# symYtAD = (YtD + YtAD.conj().T) * 0.5 +# +# U_sYtD = U @ symYtD #.H -- shouldn't matter? +# +# dE = 2.0 * (real(trace(U.H @ symYtAD)) - real(trace(YtAYU.H @ U_sYtD))) +# +# S2 = DtD - 4 * symYtD @ U_sYtD +# d2E = 2.0 * (real(trace(U.conj().T @ DtAD)) - +# real(trace(YtAYU.conj().T @ U @ S2)) - +# 4.0 * real(trace(U.conj().T @ symYtAD @ U_sYtD))) +# +# d_lag = lag = trace_YtLY = trace_DtLD = trace_YtLD = 0 +# +# # Newton-Raphson to find a root of the first derivative: +# theta = -dE/d2E +# +# if d2E < 0 or abs(theta) >= K_PI: +# theta = -abs(prev_theta) * numpy.sign(dE) +# +# # YtAYU * YtBY = YtAY for use in linmin. +# YtAY = YtAYU @ YtBY.conj().T +# +# theta, new_E, new_dE = linmin(theta, E, dE, 0.1, min(tolerance, 1e-6), 1e-14, 0, -numpy.sign(dE) * K_PI, trace_func, tfd) +# +# improvement = numpy.abs(E - new_E) * 2 / numpy.abs(E + new_E) +# logging.info('linmin improvement {}'.format(improvement)) +# z *= numpy.cos(theta) +# z += D * numpy.sin(theta) +# +# +# prev_traceGtKG = trace_GtKG +# prev_theta = theta +# prev_E = E +# +# +# def linmin(x_guess, f0, df0, x_max, f_tol=0.1, df_tol=min(tolerance, 1e-6), x_tol=1e-14, x_min=0, linmin_func): +# if df0 > 0: +# x0, f0, df0 = linmin(-x_guess, f0, -df0, -x_max, f_tol, df_tol, x_tol, -x_min, lambda q, dq: -linmin_func(q, dq)) +# return -x0, f0, -df0 +# elif df0 == 0: +# return 0, f0, df0 +# else: +# x = x_guess +# fx = f0 +# dfx = df0 +# +# dsrch(xxx,,) +# for i in range(int(1e6)): +# if task != 'F': +# logging.info('search converged in {} iterations'.format(i)) +# break +# fx = f(x, dfx) +# dsrch(...) +# +# return x, fx, dfx +# +# +# +# ''' +# Use the conjugate gradient method and the approximate gradient calculation to +# quickly find approximate eigenvectors. 
+#    '''
+#    #result = scipy.optimize.minimize(rayleigh_quotient,
+#    #                                 numpy.random.rand(*y_shape),
+#    #                                 jac=True,
+#    #                                 method='CG',
+#    #                                 tol=1e-5,
+#    #                                 options={'maxiter': 30, 'disp': True})
+#
+#    #result = scipy.optimize.minimize(lambda y: rayleigh_quotient(y, False),
+#    #                                 result.x,
+#    #                                 jac=True,
+#    #                                 method='CG',
+#    #                                 tol=1e-13,
+#    #                                 options={'maxiter': 100, 'disp': True})
+#    #
+#    #z = result.x.reshape(y_shape)
+#
+#    '''
+#    Recover eigenvectors from Z
+#    '''
+#    U = numpy.linalg.inv(z.conj().T @ z)
+#    y = z @ scipy.linalg.sqrtm(U)
+#    w = y.conj().T @ (scipy_op @ y)
+#
+#    eigvals, w_eigvecs = numpy.linalg.eig(w)
+#    eigvecs = y @ w_eigvecs
+#
+#    for i in range(len(eigvals)):
+#        v = eigvecs[:, i]
+#        n = eigvals[i]
+#        v /= norm(v)
+#        logger.info('eigness {}: {}'.format(i, norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )))
+#
+#    for i in range(len(eigvals)):
+#        logger.info('Refining eigenvector {}'.format(i))
+#        eigvals[i], eigvecs[:, i] = rayleigh_quotient_iteration(scipy_op,
+#                                                                guess_vector=eigvecs[:, i],
+#                                                                iterations=40,
+#                                                                tolerance=tolerance * numpy.real(numpy.sqrt(eigvals[i])) * 2,
+#                                                                solver = lambda A, b: spalg.bicgstab(A, b, maxiter=200)[0])
+#
+#    for i in range(len(eigvals)):
+#        v = eigvecs[:, i]
+#        n = eigvals[i]
+#        v /= norm(v)
+#        eigness = norm(scipy_op @ v - (v.conj() @ (scipy_op @ v)) * v )
+#        logger.info('eigness {}: {}'.format(i, eigness))
+#
+#    order = numpy.argsort(numpy.abs(eigvals))
+#    return eigvals[order], eigvecs.T[order]
diff --git a/fdfd_tools/eigensolver.c b/fdfd_tools/eigensolver.c
new file mode 100644
index 0000000..4109879
--- /dev/null
+++ b/fdfd_tools/eigensolver.c
@@ -0,0 +1,412 @@
+/* Copyright (C) 1999-2014 Massachusetts Institute of Technology.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config.h"
+#include <mpiglue.h>
+#include <mpi_utils.h>
+#include <check.h>
+#include <scalar.h>
+#include <matrices.h>
+#include <blasglue.h>
+
+#include "eigensolver.h"
+#include "linmin.h"
+
+extern void eigensolver_get_eigenvals_aux(evectmatrix Y, real *eigenvals,
+                                          evectoperator A, void *Adata,
+                                          evectmatrix Work1, evectmatrix Work2,
+                                          sqmatrix U, sqmatrix Usqrt,
+                                          sqmatrix Uwork);
+
+#define STRINGIZEx(x) #x /* a hack so that we can stringize macro values */
+#define STRINGIZE(x) STRINGIZEx(x)
+
+#define K_PI 3.141592653589793238462643383279502884197
+#define MIN2(a,b) ((a) < (b) ? (a) : (b))
+#define MAX2(a,b) ((a) > (b) ? (a) : (b))
+
+#if defined(SCALAR_LONG_DOUBLE_PREC)
+#  define fabs fabsl
+#  define cos cosl
+#  define sin sinl
+#  define sqrt sqrtl
+#  define atan atanl
+#  define atan2 atan2l
+#endif
+
+/* Evaluate op, and set t to the elapsed time (in seconds). */
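+/* Usage note: TIME_OP wraps a single operation, as in the (commented-out)
+   call inside eigensolver_lagrange below:
+       TIME_OP(time_KZ, K(G, X, Kdata, Y, NULL, YtBY));
+   after which time_KZ holds the elapsed seconds. */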
+#define TIME_OP(t, op) { \
+     mpiglue_clock_t xxx_time_op_start_time = MPIGLUE_CLOCK; \
+     { \
+          op; \
+     } \
+     (t) = MPIGLUE_CLOCK_DIFF(MPIGLUE_CLOCK, xxx_time_op_start_time); \
+}
+
+/**************************************************************************/
+
+#define EIGENSOLVER_MAX_ITERATIONS 100000
+#define FEEDBACK_TIME 4.0 /* elapsed time before we print progress feedback */
+
+/* Number of iterations after which to reset conjugate gradient
+   direction to steepest descent.  (Picked after some experimentation.
+   Is there a better basis?  Should this change with the problem
+   size?) */
+#define CG_RESET_ITERS 70
+
+/* Threshold for trace(1/YtBY) = trace(U) before we reorthogonalize: */
+#define EIGS_TRACE_U_THRESHOLD 1e8
+
+/**************************************************************************/
+
+/* estimated times/iteration for different iteration schemes, based
+   on the measured times for various operations and the operation counts: */
+
+#define EXACT_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ, t_linmin) \
+     ((t_AZ)*2 + (t_KZ) + (t_ZtW)*4 + (t_ZS)*2 + (t_ZtZ)*2 + (t_linmin))
+
+#define APPROX_LINMIN_TIME(t_AZ, t_KZ, t_ZtW, t_ZS, t_ZtZ) \
+     ((t_AZ)*2 + (t_KZ) + (t_ZtW)*2 + (t_ZS)*2 + (t_ZtZ)*2)
+
+/* Guess for the convergence slowdown factor due to the approximate
+   line minimization.  It is probably best to be conservative, as the
+   exact line minimization is more reliable and we only want to
+   abandon it if there is a big speed gain. */
+#define APPROX_LINMIN_SLOWDOWN_GUESS 2.0
+
+/* We also don't want to use the approximate line minimization if
+   the exact line minimization makes a big difference in the value
+   of the trace that's achieved (i.e. if one step of Newton's method
+   on the trace derivative does not do a good job).  The following
+   is the maximum improvement by the exact line minimization (over
+   one step of Newton) at which we'll allow the use of approximate line
+   minimization. */
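+/* (Concretely, with the two constants here: exact linmin is abandoned only
+   when it improves the trace by at most 5% per step while its estimated
+   per-iteration cost exceeds 2x that of the approximate scheme; see the
+   use_linmin decision at the end of eigensolver_lagrange.) */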
+#define APPROX_LINMIN_IMPROVEMENT_THRESHOLD 0.05
+
+/**************************************************************************/
+
+typedef struct {
+    sqmatrix YtAY, DtAD, symYtAD, YtBY, DtBD, symYtBD, S1, S2, S3;
+    real lag, d_lag, trace_YtLY, trace_DtLD, trace_YtLD;
+} trace_func_data;
+
+static linmin_real trace_func(linmin_real theta, linmin_real *trace_deriv, void *data)
+{
+    linmin_real trace;
+    trace_func_data *d = (trace_func_data *) data;
+    linmin_real c = cos(theta), s = sin(theta);
+
+    YDNi = c*c * YtY + s*s * DtD + 2*s*c * symYtD
+    if not YDNi.inv():    /* in-place inversion; false if singular */
+        /* if c or s is small, we sometimes run into numerical
+           difficulties, and it is better to use the Taylor expansion */
+        if c < 1e-4 * s and c != 0:
+            YDNi = DtD.inv()
+            S3 = (YDNi @ symYtD.H) @ YDNi.H
+            YDNi = 1/(s*s) * YDNi - 2*c/(s*s*s) * S3
+        elif s < 1e-4 * c and s != 0:
+            YDNi = YtY.inv()
+            S3 = (YDNi @ symYtD.H) @ YDNi.H
+            YDNi = 1/(c*c) * YDNi - 2*s/(c*c*c) * S3
+        else:
+            CHECK(0, "inexplicable singularity in linmin trace_func")
+
+    YADN = c*c * YtAY + s*s * DtAD + 2*s*c * symYtAD
+    trace = real(trace(YADN.H @ YDNi))
+            + (c*c * trace_YtLY + s*s * trace_DtLD + 2*s*c * trace_YtLD) * (c * lag + s * d_lag)
+
+    if (trace_deriv) {
+        c2 = cos(2*theta)
+        s2 = sin(2*theta)
+
+        S3 = -0.5 * s2 * (YtAY - DtAD) + c2 * symYtAD
+        *trace_deriv = real(trace(YDNi.H @ S3))
+
+        S2 = (YDNi @ YADN.H) @ YDNi.H
+        S3 = -0.5 * s2 * (YtY - DtD) + c2 * symYtD
+        *trace_deriv -= real(trace(S2.H @ S3))
+        *trace_deriv *= 2
+
+        *trace_deriv += (-s2 * trace_YtLY + s2 * trace_DtLD
+                         + 2*c2 * trace_YtLD) * (c * lag + s * d_lag);
+        *trace_deriv += (c*c * trace_YtLY + s*s * trace_DtLD
+                         + 2*s*c * trace_YtLD) * (-s * lag + c * d_lag);
+    }
+
+    return trace;
+}
+
+/**************************************************************************/
+
+#define EIG_HISTORY_SIZE 5
+
+/* find generalized eigenvectors Y of (A,B) by minimizing Rayleigh quotient
+
+        tr [ Yt A Y / (Yt B Y) ] + lag * tr [ Yt L Y ]
+
+   where lag is a Lagrange multiplier and L is a Hermitian operator
+   implementing some constraint tr [ Yt L Y ] = 0 on the eigenvectors
+   (if L is not NULL).
+
+   Constraints that commute with A and B (and L) are specified via the
+   "constraint" argument, which gives the projection operator for
+   the constraint(s).
+*/
+void eigensolver_lagrange(evectmatrix Y, real *eigenvals,
+                          evectoperator A, void *Adata,
+                          evectoperator B, void *Bdata,
+                          evectpreconditioner K, void *Kdata,
+                          evectconstraint constraint, void *constraint_data,
+                          evectoperator L, void *Ldata, real *lag,
+                          evectmatrix Work[], int nWork,
+                          real tolerance, int *num_iterations,
+                          int flags)
+{
+    real g_lag = 0, d_lag = 0, prev_g_lag = 0;
+    short usingConjugateGradient = 0, use_polak_ribiere = 0,
+          use_linmin = 1;
+    real E, prev_E = 0.0;
+    real d_scale = 1.0;
+    real traceGtX, prev_traceGtX = 0.0;
+    real theta, prev_theta = 0.5;
+    int i, iteration = 0, num_emergency_restarts = 0;
+    mpiglue_clock_t prev_feedback_time;
+    real linmin_improvement = 0;
+
+    G = Work[0];
+    X = Work[1];
+
+    BY = Y;
+    D = X;
+    BD = D;    /* storage for B*D (re-use B*Y storage) */
+    prev_G = G;
+
+ restartY:
+
+    eigenvals *= 0.0
+    convergence_history = [10000.0] * EIG_HISTORY_SIZE
+    constraint(Y, constraint_data)
+
+    do {
+        real y_norm, gamma_numerator = 0;
+
+        YtBY = Y.H @ Y
+        y_norm = sqrt(real(trace(YtBY)) / Y.p);
+        Y /= y_norm
+        YtBY /= y_norm*y_norm
+        U = YtBY
+
+        if (!sqmatrix_invert(U, 1, S2))    /* non-independent Y columns */
+            /* emergency restart with random Y */
+            ...
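+            /* (sketch of the elided restart path, using num_emergency_restarts
+               and the restartY label declared above -- assumed, not verbatim:
+                   num_emergency_restarts++;
+                   randomize Y;
+                   goto restartY;                                            ) */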
+
+        /* If trace(1/YtBY) gets big, it means that the columns
+           of Y are becoming nearly parallel.  This sometimes happens,
+           especially in the targeted eigensolver, because the
+           preconditioner pushes all the columns towards the ground
+           state.  If it gets too big, it seems to be a good idea
+           to re-orthogonalize, resetting conjugate-gradient, as
+           otherwise we start to encounter numerical problems. */
+        if (flags & EIGS_REORTHOGONALIZE) {
+            traceU = real(trace(U))
+            if (traceU > EIGS_TRACE_U_THRESHOLD * U.p) {
+                Y = Y @ sqrtm(U).H    /* orthonormalize Y */
+                prev_traceGtX = 0.0;
+
+                YtBY = Y.H @ Y
+                y_norm = sqrt(real(trace(YtBY)) / Y.p)
+                Y /= y_norm
+                YtBY /= y_norm * y_norm
+                U = YtBY
+            }
+        }
+
+        /* G = AYU; note that U is Hermitian: */
+        G = A @ Y @ U
+        YtAYU = Y.H @ G
+        E = real(trace(YtAYU))
+
+        convergence_history[iteration % EIG_HISTORY_SIZE] =
+            200.0 * fabs(E - prev_E) / (fabs(E) + fabs(prev_E));
+
+        if (iteration > 0 &&
+            fabs(E - prev_E) < tolerance * 0.5 * (E + prev_E + 1e-7))
+            break;    /* convergence!  hooray! */
+
+        /* Compute gradient of functional: G = (1 - BY U Yt) A Y U */
+        G += -Y @ (U @ YtAYU)
+
+        /* set X = precondition(G): */
+        X = K @ G
+        //TIME_OP(time_KZ, K(G, X, Kdata, Y, NULL, YtBY));
+        /* We have to apply the constraint here, in case it doesn't
+           commute with the preconditioner. */
+        constraint(X, constraint_data);
+
+        /* conjugate-gradient direction update (cf. eigsolve2 in bloch.py): */
+        traceGtX = real(trace(G.H @ X)) + g_lag * g_lag
+        if prev_traceGtX == 0 or iteration % CG_RESET_ITERS == 0:
+            gamma = 0
+        else:
+            gamma = traceGtX / prev_traceGtX
+        D = gamma * d_scale * D + X
+
+        d_scale = 1.0;
+
+        /* Minimize the trace along Y + lambda*D: */
+
+        if (!use_linmin) {
+            real dE, E2, d2E, t, d_norm;
+
+            /* Here, we do an approximate line minimization along D
+               by evaluating dE (the derivative) at the current point,
+               and the trace E2 at a second point, and then approximating
+               the second derivative d2E by finite differences.  Then,
+               we use one step of Newton's method on the derivative.
+               This has the advantage of requiring two fewer O(np^2)
+               matrix multiplications compared to the exact linmin. */
+
+            d_norm = sqrt(real(trace(D.H @ D)) / Y.p);
+
+            /* dE = 2 * tr Gt D.  (Use prev_G instead of G so that
+               it works even when we are using Polak-Ribiere.) */
+            dE = 2.0 * SCALAR_RE(evectmatrix_traceXtY(prev_G, D)) / d_norm;
+
+            /* shift Y by prev_theta along D, in the downhill direction: */
+            t = dE < 0 ? -fabs(prev_theta) : fabs(prev_theta);
+            Y += t/d_norm * D
+
+            U = inv(Y.H @ Y)
+            E2 = real(trace((Y.H @ A @ Y).H @ U))
+
+            /* Get finite-difference approximation for the 2nd derivative
+               of the trace.  Equivalently, fit to a quadratic of the
+               form:  E(theta) = E + dE theta + 1/2 d2E theta^2 */
+            d2E = (E2 - E - dE * t) / (0.5 * t * t);
+
+            theta = -dE/d2E;
+
+            /* If the 2nd derivative is negative, or a big shift
+               in the trace is predicted (compared to the previous
+               iteration), then this approximate line minimization is
+               probably not very good; switch back to the exact
+               line minimization.  Hopefully, we won't have to
+               abort like this very often, as it wastes operations. */
+            if (d2E < 0 || -0.5*dE*theta > 20.0 * fabs(E-prev_E)) {
+                if (flags & EIGS_FORCE_APPROX_LINMIN) {
+                }
+                else {
+                    use_linmin = 1;
+                    evectmatrix_aXpbY(1.0, Y, -t / d_norm, D);
+                    prev_theta = atan(prev_theta);    /* convert to angle */
+                    /* don't do this again: */
+                    flags |= EIGS_FORCE_EXACT_LINMIN;
+                }
+            }
+            else {
+                /* Shift Y by theta, hopefully minimizing the trace: */
+                evectmatrix_aXpbY(1.0, Y, (theta - t) / d_norm, D);
+            }
+        }
+        if (use_linmin) {
+            d_scale = sqrt(real(trace(D.H @ D)) / Y.p);
+            D /= d_scale
+
+            AD = A @ D
+            DtD = D.H @ D
+            DtAD = D.H @ AD
+
+            YtD = Y.H @ D
+            YtAD = Y.H @ AD
+            symYtD = (YtD + YtD.H) / 2
+            symYtAD = (YtAD + YtAD.H) / 2
+
+            U_sYtD = U @ symYtD.H
+            dE = 2.0 * (real(trace(U.H @ symYtAD)) - real(trace(YtAYU.H @ U_sYtD)))
+
+            S2 = DtD - 4 * symYtD @ U_sYtD
+            d2E = 2.0 * (real(trace(U.H @ DtAD)) -
+                         real(trace(YtAYU.H @ U @ S2)) -
+                         4.0 * real(trace(U.H @ symYtAD @ U_sYtD)))
+
+            d_lag = lag = trace_YtLY = trace_DtLD = trace_YtLD = 0
+
+            /* this is just Newton-Raphson to find a root of
+               the first derivative: */
+            theta = -dE/d2E;
+
+            if d2E < 0 or abs(theta) >= K_PI:
+                theta = -abs(prev_theta) * numpy.sign(dE)
+
+            /* Set S1 to YtAYU * YtBY = YtAY for use in linmin.
+               (tfd.YtAY == S1). */
+            YtAY = YtAYU @ YtBY.H
+
+            theta = linmin(&new_E, &new_dE, theta, E, dE,
+                           0.1, min(tolerance, 1e-6), 1e-14,
+                           0, -numpy.sign(dE) * K_PI,
+                           trace_func, &tfd,
+                           flags & EIGS_VERBOSE)
+            linmin_improvement = abs(E - new_E) * 2.0/abs(E + new_E);
+
+            /* Shift Y to new location minimizing the trace along D: */
+            Y *= cos(theta)
+            Y += D * sin(theta)
+        }
+
+        /* In exact arithmetic, we don't need to do this, but in practice
+           it is probably a good idea to keep errors from adding up and
+           eventually violating the constraints. */
+        constraint(Y, constraint_data);
+
+        prev_traceGtX = traceGtX;
+        prev_theta = theta;
+        prev_E = E;
+
+        /* Finally, we use the times for the various operations to
+           help us pick an algorithm for the next iteration: */
+        {
+            real t_exact, t_approx;
+            t_exact = EXACT_LINMIN_TIME(time_AZ, time_KZ, time_ZtW,
+                                        time_ZS, time_ZtZ, time_linmin);
+            t_approx = APPROX_LINMIN_TIME(time_AZ, time_KZ, time_ZtW,
+                                          time_ZS, time_ZtZ);
+
+            /* Sum the times over the processors so that all the
+               processors compare the same, average times. */
+            mpi_allreduce_1(&t_exact,
+                            real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
+            mpi_allreduce_1(&t_approx,
+                            real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
+
+            if (!(flags & EIGS_FORCE_EXACT_LINMIN) &&
+                linmin_improvement > 0 &&
+                linmin_improvement <= APPROX_LINMIN_IMPROVEMENT_THRESHOLD &&
+                t_exact > t_approx * APPROX_LINMIN_SLOWDOWN_GUESS) {
+                use_linmin = 0;
+            }
+            else if (!(flags & EIGS_FORCE_APPROX_LINMIN)) {
+                use_linmin = 1;
+                prev_theta = atan(prev_theta);    /* convert to angle */
+            }
+        }
+    } while (++iteration < EIGENSOLVER_MAX_ITERATIONS);
+
+    evectmatrix_XtX(U, Y, S2);
+    CHECK(sqmatrix_invert(U, 1, S2), "singular YtBY at end");
+    eigensolver_get_eigenvals_aux(Y, eigenvals, A, Adata,
+                                  X, G, U, S1, S2);
+
+}
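
A quick numerical sanity check (illustrative sketch, not part of the patch):
the exact-linmin objective that trace_func computes, E(theta) =
Re tr[(Z.H A Z) inv(Z.H Z)] with Z = Y cos(theta) + D sin(theta), can be
probed in numpy using finite differences plus one Newton step
theta = -dE/d2E, mirroring the code above. A, Y, and D here are random
stand-ins; only the formulas come from the patch.

    import numpy

    rng = numpy.random.default_rng(0)
    n, p = 12, 3
    M = rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))
    A = (M + M.conj().T) / 2     # Hermitian stand-in for the Maxwell operator
    Y = rng.standard_normal((n, p)) + 1j * rng.standard_normal((n, p))
    D = rng.standard_normal((n, p)) + 1j * rng.standard_normal((n, p))

    def E(theta):
        Z = Y * numpy.cos(theta) + D * numpy.sin(theta)
        U = numpy.linalg.inv(Z.conj().T @ Z)
        return numpy.real(numpy.trace((Z.conj().T @ (A @ Z)) @ U))

    h = 1e-5
    dE = (E(h) - E(-h)) / (2 * h)             # dE/dtheta at theta = 0
    d2E = (E(h) - 2 * E(0) + E(-h)) / h**2    # d2E/dtheta^2 at theta = 0
    theta = -dE / d2E                         # one Newton step on dE
    print('E(0) = {}, E(theta) = {}'.format(E(0), E(theta)))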