diff --git a/Makefile b/Makefile
index 247b809..c06b0d8 100644
--- a/Makefile
+++ b/Makefile
@@ -42,8 +42,9 @@ TEST_OBJECTS = $(TDIR)/obj/seatest.o \
                $(TDIR)/obj/run_tests.o
 
 LINK_COMMON = -lm
-LINK_FFTW = -L$(FFTWROOT) -lfftw3 -lfftw3f
-LINK_MKL = -L${MKLROOT}/lib/intel64 -lmkl_rt -Wl,--no-as-needed -lpthread -lm -ldl
+LINK_FFTW = -L$(FFTW_HOME)/lib -lfftw3 -lfftw3f -Wl,-rpath,${FFTW_HOME}/lib
+CFLAGS_FFTW = -I"${FFTW_HOME}/include"
+LINK_MKL = -L${MKLROOT}/lib -lmkl_rt -Wl,--no-as-needed,-rpath,${MKLROOT}/lib -lpthread -lm -ldl
 CFLAGS_MKL = -m64  -I"${MKLROOT}/include"
 
 all: $(LDIR)/libradial_functional.so ${LDIR}/libksw_estimator.so ${LDIR}/libksw_fisher.so
@@ -64,25 +65,25 @@ $(ODIR)/hyperspherical.o: $(HS_SDIR)/hyperspherical.c $(HS_IDIR)/*.h
 	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(HS_IDIR)
 
 $(LDIR)/libksw_estimator.so: $(EST_OBJECTS) $(LP_OBJECTS) $(IDIR)/ksw_estimator.h
-	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(OMPFLAG) $(OPTFLAG) -shared -o $@ $< $(LP_OBJECTS) $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(CFLAGS_FFTW) $(OMPFLAG) $(OPTFLAG) -shared -o $@ $< $(LP_OBJECTS) $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp
 
 $(LDIR)/libksw_fisher.so: $(FIS_OBJECTS) $(LP_OBJECTS) $(IDIR)/ksw_fisher.h
-	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(OMPFLAG) $(OPTFLAG) -shared -o $@ $< $(LP_OBJECTS) $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(CFLAGS_FFTW) $(OMPFLAG) $(OPTFLAG) -shared -o $@ $< $(LP_OBJECTS) $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp
 
 $(ODIR)/estimator.o: $(SDIR)/estimator.c $(IDIR)/*.h
-	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(CFLAGS_FFTW) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR)
 
 $(ODIR)/fisher.o: $(SDIR)/fisher.c $(IDIR)/*.h
-	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) $(LINK_COMMON) $(LINK_MKL)
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) $(LINK_COMMON)
 
 $(ODIR)/ylmgen_c.o: $(LP_SDIR)/ylmgen_c.c $(LP_IDIR)/*.h
-	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) $(LINK_COMMON) 
+	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR)
 
 $(ODIR)/c_utils.o: $(LP_SDIR)/c_utils.c $(LP_IDIR)/*.h
-	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) $(LINK_COMMON) 
+	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR)
 
 $(ODIR)/walltime_c.o: $(LP_SDIR)/walltime_c.c $(LP_IDIR)/*.h
-	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) $(LINK_COMMON) 
+	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR)
 
 check: check_c check_python
 
@@ -93,7 +94,7 @@ check_python:
 	cd $(TDIR); python -m pytest python/
 
 $(TDIR)/bin/run_tests: $(TEST_OBJECTS) $(LDIR)/libradial_functional.so $(LDIR)/libksw_estimator.so $(LDIR)/libksw_fisher.so
-	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -o $@ $(TEST_OBJECTS) -L$(LDIR) -lradial_functional -lksw_estimator -lksw_fisher $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp -Wl,-rpath,$(LDIR)
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(CFLAGS_FFTW) $(OMPFLAG) $(OPTFLAG) -o $@ $(TEST_OBJECTS) -L$(LDIR) -lradial_functional -lksw_estimator -lksw_fisher $(LINK_COMMON) $(LINK_FFTW) $(LINK_MKL) -lgomp -Wl,-rpath,$(LDIR)
 
 $(TDIR)/obj/run_tests.o: $(TDIR)/src/run_tests.c $(TDIR)/include/seatest.h
 	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(TDIR)/include -I$(IDIR) -I$(LP_IDIR)
@@ -105,10 +106,10 @@ $(TDIR)/obj/test_radial_functional.o: $(TDIR)/src/test_radial_functional.c $(IDI
 	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(HS_IDIR) -I$(TDIR)/include
 
 $(TDIR)/obj/test_estimator.o: $(TDIR)/src/test_estimator.c $(IDIR)/*.h
-	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) -I$(TDIR)/include
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(CFLAGS_FFTW) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) -I$(TDIR)/include
 
 $(TDIR)/obj/test_fisher.o: $(TDIR)/src/test_fisher.c $(IDIR)/*.h
-	$(CC) $(CFLAGS) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) -I$(TDIR)/include
+	$(CC) $(CFLAGS) $(CFLAGS_MKL) $(CFLAGS_FFTW) $(OMPFLAG) $(OPTFLAG) -c -o $@ $< -I$(IDIR) -I$(LP_IDIR) -I$(TDIR)/include
 
 clean:
 	rm -rf $(ODIR)
diff --git a/README.md b/README.md
index d06ccda..d9b8668 100644
--- a/README.md
+++ b/README.md
@@ -23,9 +23,10 @@ An implementation of the Komatsu-Spergel-Wandelt bispectrum estimator for modern
 
 Start by making sure the MKL library is loaded in your environment. On most clusters this can be achieved by loading a predefined module. On the Princeton `della` and `tiger` clusters you can use `module load intel-mkl` (see [here](https://researchcomputing.princeton.edu/faq/how-to-build-using-intel-mkl) for more information). On `NERSC` you can use `load intel` (see [here](https://docs-dev.nersc.gov/cgpu/software/math/)). Once you have loaded the module, check if the `MKLROOT` environment variable has been set (`echo $MKLROOT`).
 
+Then do the same for the FFTW library. Right now, the Makefile assumes that the `FFTW_HOME` environment variable is set. So that make sure that is the case and that `FFTW_HOME/lib` and `FFTW_HOME/include` point to the correct directories.
 
 
-Once the MKL environment has been set, `git clone` this repository, go into the directory and run:
+Once the MKL and FFTW environments have been set, `git clone` this repository, go into the directory and run:
 
 
 ```
diff --git a/cython/radial_functional.pxd b/cython/radial_functional.pxd
index 0a1f10e..ebbc849 100644
--- a/cython/radial_functional.pxd
+++ b/cython/radial_functional.pxd
@@ -10,3 +10,4 @@ cdef extern from "radial_functional.h":
 			 int nr,
 			 int npol,
 			 int ncomp);
+
diff --git a/cython/radial_functional.pyx b/cython/radial_functional.pyx
index 6c205b2..8883e89 100644
--- a/cython/radial_functional.pyx
+++ b/cython/radial_functional.pyx
@@ -70,8 +70,9 @@ def radial_func(f_k, tr_ell_k, k, radii, ells):
                          npol,
                          ncomp)
     return f_ell_r
+
     
-def check_and_return_shape(arr, exp_shape):
+def check_and_return_shape(arr, exp_shape):    
     '''
     Return shape of input array if it matches expections.
 
@@ -109,3 +110,91 @@ def check_and_return_shape(arr, exp_shape):
                     idx, n_exp, n))            
     return arr.shape
 
+
+def radial_func_dL(f_k, tr_ell_k, k, radii, ells):
+    '''
+    Compute f_ell^X(r) = (2/pi) int k^2 dk f(k) transfer^X_ell(k) j_ell(k r),
+    where f(k) is an arbitrary function of wavenumber k.
+
+    Parameters
+    ---------
+    f_k : (nk, ncomp) array
+        Input functions.
+    tr_ell_k : (nell, nk, npol) array
+        Transfer functions.
+    k : (nk) array
+        Wavenumbers in 1/Mpc.
+    radii : (nr) array
+        Multipoles
+    
+    Returns
+    -------
+    f_ell_r : (nr, nell, npol, ncomp, delta_L_list) array
+        Evaluated integral for all radii, multipoles, polarizations 
+        and input function components.
+
+    Raises
+    ------
+    ValueError
+        If input array shapes are incorrect.
+        If the input arrays contain nans or infs.
+    '''
+
+    # Check input for nans and infs.
+    f_k = np.asarray_chkfinite(f_k, dtype=float, order='C')
+    tr_ell_k = np.asarray_chkfinite(tr_ell_k, dtype=float, order='C')
+    k = np.asarray_chkfinite(k, dtype=float, order='C')
+    radii = np.asarray_chkfinite(radii, dtype=float, order='C')
+    ells = np.asarray_chkfinite(ells, dtype=np.int32, order='C')
+    delta_L_list = np.asarray_chkfinite([-2, -1, 0, 1, 2], dtype=np.int32, order='C')
+
+    # Check for input shapes.
+    nk, = check_and_return_shape(k, (None,))
+    nr, = check_and_return_shape(radii, (None,))
+    nell, = check_and_return_shape(ells, (None,))
+    nk, ncomp = check_and_return_shape(f_k, (nk, None))
+    nell, nk, npol = check_and_return_shape(tr_ell_k, (nell, nk, None))
+    ndeltaL, = check_and_return_shape(delta_L_list, (None,))
+
+    # Create output array.
+    f_ell_r = np.empty((ndeltaL, nr, nell, npol, ncomp), dtype=float)
+
+    cdef double [::1] f_k_ = f_k.reshape(-1)
+    cdef double [::1] tr_ell_k_ = tr_ell_k.reshape(-1)
+    cdef double [::1] k_ = k.reshape(-1)
+    cdef double [::1] radii_ = radii.reshape(-1)
+    cdef double [::1] f_ell_r_ = f_ell_r.reshape(-1)
+    cdef int [::1] ells_= ells
+
+    # Define local variables
+    cdef Py_ssize_t i, index        # loop indices (Py_ssize_t = signed size type)
+    cdef int L, delta_L             # temporary variables for ell and its shifted version
+    
+    # Preallocate a temporary buffer to hold valid shifted ell values   
+    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] L_arr = np.empty(nell, dtype=np.int32)
+    cdef int[::1] L_buf = L_arr    # contiguous int memoryview (to pass to the C function)
+
+    # Loop over each delta_L value
+    for index in range(ndeltaL):
+        delta_L = delta_L_list[index]
+
+        # build the shifted ell list (L = ell+delta_L), excluding negative results
+        for i in range(nell):
+            L = ells_[i] + delta_L
+            if L >= 0:
+                L_buf[i] = L
+            else:
+                L_buf[i] = 0      
+                
+        radial_func = compute_radial_func(&f_k_[0],
+                                         &tr_ell_k_[0],
+                                         &k_[0],
+                                         &radii_[0],
+                                         &f_ell_r_[index * (nr * nell * npol * ncomp)],
+                                         &L_buf[0],
+                                         nk,
+                                         nell,
+                                         nr,
+                                         npol,
+                                         ncomp)
+    return f_ell_r
\ No newline at end of file
diff --git a/include/radial_functional.h b/include/radial_functional.h
index f58a81e..df9584a 100644
--- a/include/radial_functional.h
+++ b/include/radial_functional.h
@@ -81,3 +81,37 @@ void _trapezoidal_weights(double const *k,
  */
 
 void * _malloc_checked(size_t size);
+
+
+/* 
+* Compute f_ell^X(r) = (2/pi) int k^2 dk f(k) transfer^X_ell(k) j_ell(k r)
+* where f(k) is an arbitrary function of wavenumber k.
+* 
+* Arguments
+* ---------
+* f_k      : (nk * ncomp) input functions.
+* tr_ell_k : (nell * nk * npol) transfer function.
+* k        : (nk) array of monotonically increasing wavenumbers.
+* radii    : (nr) array of radii
+* f_r_ell  : (nr * nell * npol * ncomp) output array.
+* ells     : (nell) multipoles.
+* nk       : Number of wavenumbers.
+* nell     : Number of multipoles.
+* nr       : Number of radii.
+* npol     : Number of polarization components (1=T, 2= E, 3=B).
+* ncomp    : Number of input functions.
+* delta_l  : shift in ell for transfer functions
+*/
+
+void compute_radial_func_dL2(double const *f_k,
+                            double const *tr_ell_k,
+                            double *k,
+                            double const *radii,
+                            double *f_r_ell,
+                            int *ells,
+                            int nk,
+                            int nell,
+                            int nr,
+                            int npol,
+                            int ncomp,
+                            int delta_l);      // New: delta ell
diff --git a/ksw/Afunctionals.py b/ksw/Afunctionals.py
new file mode 100644
index 0000000..9441f3a
--- /dev/null
+++ b/ksw/Afunctionals.py
@@ -0,0 +1,196 @@
+import numpy as np
+from ducc0.misc import wigner3j_int
+from ksw import utils, estimator_core
+from estimator import KSW
+
+ksw = KSW.__new__(KSW)
+ksw.lmax = 100
+ksw.dtype = np.float32
+ksw.cdtype = np.complex64
+ksw.pol = ["T", "E"]
+
+
+#deltaL_list = {-1, +1} for scalar
+#deltaL_list = {-2, -1, 0, +1, +2} for tensor 
+ 
+   
+def wigner_J(S, L_list, deltaL_list, Jindex):
+    L_list = np.asarray(L_list, dtype=int)
+    deltaL_list = np.asarray(deltaL_list, dtype=int)
+
+    out = np.zeros((len(L_list), len(deltaL_list)), dtype=np.float64)
+
+    cJ = np.zeros((len(L_list), len(deltaL_list)), dtype=np.float64)
+    for iL, L in enumerate(L_list):
+        for idL, dL in enumerate(deltaL_list):
+            ell = L + dL
+            if ell < 0:
+                continue
+
+            # --- selection rules ---
+            if abs(Jindex[1]) > L:
+                continue
+                if abs(Jindex[2]) > ell:
+                    continue
+
+            l1_min_J, vals_J = wigner3j_int(L, ell, Jindex[1], Jindex[2])
+            iS = S - l1_min_J
+            if 0 <= iS < len(vals_J):
+                out[iL, idL] = vals_J[iS]
+    return out
+
+def w3j(S, n, L_list, deltaL_list):
+
+    L_list = np.asarray(L_list, dtype=int)
+    deltaL_list = np.asarray(deltaL_list, dtype=int)
+
+    Lmax = np.max(L_list)
+    out = np.zeros((len(L_list), 2*Lmax+1, len(deltaL_list)), dtype=np.float64)
+
+    for idL, dL in enumerate(deltaL_list):
+        for iL, L in enumerate(L_list):
+            ell = L + dL
+            if ell < 0:
+                continue
+
+            for M in range(-L, L+1):
+                m = - M - n
+                if abs(m) > ell:
+                    continue
+
+                # --- selection rules ---
+                if abs(M) > L:
+                    continue
+                if abs(m) > ell:
+                    continue
+
+                l1_min, vals = wigner3j_int(L, ell, M, m)
+                iS = S - l1_min
+                if 0 <= iS < len(vals):
+                    out[iL, M + Lmax, idL] = vals[iS]
+    return out
+
+def products_3j_array(S, n , L_list, deltaL_list, Jindex):
+    """
+    Compute the product of two 3j symbols
+    J_(S, L, L+deltaL)^(Jindex) * (S L L+deltaL; n M m)
+    Parameters
+    ----------
+    S : int
+        Total angular momentum quantum number
+    n: int
+        Magnetic quantum number
+    L_list : int
+        Orbital angular momentum quantum number
+    deltaL_list : int
+        choices for ell
+    Jindex: an array of shape(3, )
+        The indices (n, M, m) for the first 3j symbol
+
+    Returns:
+    -------
+    np.ndarray
+        Array of products of 3j symbols with shape(N_L, N_M, N_deltaL_list)
+    """
+    L_list = np.asarray(L_list, dtype=int)
+    Lmax = int(np.max(L_list))
+
+    w3j_J = wigner_J(S, L_list, deltaL_list, Jindex)
+    w3j_vals = w3j(S, n, L_list, deltaL_list)
+
+    out = np.zeros((len(L_list), 2*Lmax+1, len(deltaL_list)), dtype=np.float64)
+
+    for iL in range(len(L_list)):
+        for idL in range(len(deltaL_list)):
+            out[iL, :, idL] = w3j_J[iL, idL] * w3j_vals[iL, :, idL]
+    return out
+
+
+def parity_x(x):
+    """
+    Map x in {T, E, B} to parity code:
+    0 for T/E (parity even)
+    1 for B (parity odd)
+    """
+    x = x.upper()
+    if x == "T" or x == "E":
+        return 0
+    if x == "B":
+        return 1
+    
+def gamma_Z(x, Z, L, deltaL):
+    """
+    gamma^{(Z)}_{x, L, ell}:
+      = 1,                        if Z = zeta
+      = 1 + (-1)^{p_x + L + ell}, if Z = h
+
+    where p_x = 0 for T/E and 1 for B.
+    """
+    Zlow = Z.lower()
+    if Zlow == "zeta":
+        return 1
+    if Zlow == "h":
+        px = parity_x(x)
+        sgn = 1 + (-1) ** (px + 2 * L + deltaL)
+        return sgn
+
+def get_a_lm(alm, ell, m):
+    """
+    access a_{lm} assuming a_ell_m stores m>=0 in the last axis.
+
+    If m < 0, use reality condition:
+    a_{ell, -m} = (-1)^m * conj(a_{ell, m})
+    """
+    alm = utils.alm_return_2d(alm, ksw.npol, ksw.lmax)
+    a_ell_m = utils.alm2a_ell_m(alm)
+    a_ell_m = a_ell_m.astype(ksw.cdtype)
+
+    if m >= 0:
+        return a_ell_m[ell, m]
+    else:
+        mp = -m
+        return ((-1)**mp) * np.conjugate(a_ell_m[ell, mp])
+
+# thetas.max() = np.pi
+
+def A_LM(deltaL_list, L_list, n, x, Z, S, Jindex, alm, theta_batch=1):
+
+    Lmax = np.max(L_list)
+    alm_list = np.zeros((len(L_list), 2*Lmax+1, len(deltaL_list)), dtype=np.complex128)
+    out = np.zeros((len(L_list), 2*Lmax+1, len(deltaL_list)), dtype=np.complex128)
+
+    w3j_product = products_3j_array(S, n, L_list, deltaL_list, Jindex)
+
+    thetas, theta_weights, nphi = KSW.get_coords(ksw)
+    tidx_start = 20
+    thetas_batch = thetas[tidx_start:tidx_start+theta_batch]
+    y_m_ell = estimator_core.compute_ylm(thetas_batch, ksw.lmax, dtype=ksw.dtype)
+
+    for iL, L in enumerate(L_list):
+        for idL, dL in enumerate(deltaL_list):
+            ell = L + dL
+            gamma = gamma_Z(x, Z, L, dL)
+            phase = (1j) ** dL
+            if ell < 0:
+                continue
+            for M in range(-L, L+1):
+                m = - M - n
+                if abs(m) > ell:
+                    continue
+                if abs(M) > L:
+                    continue
+                alm = get_a_lm(alm, ell=L+dL, m=-M-n) 
+                alm_list[iL, M+Lmax, idL] = alm
+                # prefactors independent of M
+                prefactor = phase * w3j_product[iL, Lmax+M, idL] * gamma
+                out[iL, M+Lmax, idL] = prefactor * alm * y_m_ell[:, M+Lmax, iL]
+    return out
+
+
+
+
+
+
+    
+    
+    
\ No newline at end of file
diff --git a/ksw/cosmo.py b/ksw/cosmo.py
index 32893d2..50c699d 100644
--- a/ksw/cosmo.py
+++ b/ksw/cosmo.py
@@ -6,7 +6,8 @@
 import camb
 import h5py
 
-from ksw import utils, radial_functional as rf
+from ksw import utils
+from ksw import radial_functional as rf
 
 class Cosmology:
     '''
@@ -74,7 +75,7 @@ def __init__(self, camb_params, verbose=False):
                           verbose=verbose)
         self._setattr_camb('IntkAccuracyBoost', 5, subclass='Accuracy',
                           verbose=verbose)
-        self._setattr_camb('lSampleBoost', 2, subclass='Accuracy',
+        self._setattr_camb('lSampleBoost', 2, subclass='Accuracy', 
                           verbose=verbose)
         self._setattr_camb('lAccuracyBoost', 2, subclass='Accuracy',
                           verbose=verbose)
@@ -142,7 +143,7 @@ def _setattr_camb(self, name, value, subclass=None, verbose=True):
                 'New value {} for param {} makes params invalid.'.format(
                     value, name))
 
-    def compute_transfer(self, lmax, verbose=True):
+    def compute_transfer(self, lmax, k_eta_fac=2.5, verbose=True):
         '''
         Call CAMB to calculate radiation transfer functions.
 
@@ -165,7 +166,7 @@ def compute_transfer(self, lmax, verbose=True):
             # CAMB crashes for too low lmax.
             raise ValueError('Pick lmax >= 300.')
 
-        k_eta_fac = 2.5 # Default used by CAMB.
+        #k_eta_fac = 2.5 # Default used by CAMB.
         self.camb_params.set_for_lmax(lmax, lens_margin=0,
                                       k_eta_fac=k_eta_fac)
 
@@ -211,6 +212,79 @@ def compute_transfer(self, lmax, verbose=True):
         self.transfer['k'] = tr.q
         self.transfer['ells'] = ells # Probably sparse.
 
+    def compute_transfer_tensor(self, lmax, k_eta_fac=2.5, verbose=True):
+        """
+        Call CAMB to calculate tensor transfer functions.
+
+        Parameters
+        ----------
+        lmax : int
+            Maximum multipole.
+        Verbose : bool, optional
+            Print if CAMB parameter values change.
+
+        Raises
+        ------
+        AttributeError
+            If CAMB parameters have not been initialized.
+        ValueError
+            If lmax is too low (lmax < 300).
+        """
+
+        if lmax < 300:
+            raise ValueError("Pick lmax >= 300.")
+        
+        self._setattr_camb('WantTensors', True, verbose=verbose)
+        lmax = max(lmax, 300)
+        #k_eta_fac = 2.5
+        max_eta_k = k_eta_fac * lmax
+        max_eta_k = max(max_eta_k, 1000)
+
+        self.camb_params.max_l = lmax
+        self.camb_params.max_l_tensor = lmax
+        self.camb_params.max_eta_k = max_eta_k
+        self.camb_params.max_eta_k_tensor = max_eta_k
+
+        if not self.camb_params.validate():
+            raise ValueError(f"Value {lmax} for lmax makes params invalid")
+
+        # Actually run CAMB
+        data = camb.get_transfer_functions(self.camb_params)
+        self._camb_data = data
+
+        # tensor instead of scalar
+        tr = data.get_cmb_transfer_data('tensor')
+        try:
+            ells = tr.L
+        except AttributeError:
+            ells = tr.l
+        ells = ells.astype(int)
+
+        prefactor = np.sqrt((ells + 2) * (ells + 1) * ells * (ells - 1))
+        tr.delta_p_l_k[0,...] *= prefactor[:,np.newaxis]
+
+        tr.delta_p_l_k *= (self.camb_params.TCMB * 1e6)
+
+        # Scale tensor transfer by 1/4 to correct for CAMB output.
+        tr.delta_p_l_k /= 4.
+
+        nk = tr.q.size
+        nell = ells.size
+        # For tensor we have (T, E, B)
+        npol = tr.delta_p_l_k.shape[0]
+        tr_ell_k = np.empty((nell, nk, npol), dtype=float)
+
+        tr_view = np.swapaxes(tr.delta_p_l_k, 0, 2) # (nk, nell, npol)
+        tr_view = np.swapaxes(tr_view, 0, 1)        # (nell, nk, npol)
+        tr_ell_k[:] = np.ascontiguousarray(tr_view)
+
+        self.transfer['tr_ell_k_tensor'] = tr_ell_k
+        self.transfer['k_tensor'] = tr.q
+        self.transfer['ells_tensor'] = ells
+    
+        self._setattr_camb('WantTensors', False, verbose=verbose)
+
+
     def compute_c_ell(self, lmax=None):
         '''
         Calculate angular power spectra (Cls) using precomputed
@@ -252,6 +326,7 @@ def compute_c_ell(self, lmax=None):
         self.c_ell['lenspotential']['ells'] = ells_lenspotential
         self.c_ell['lenspotential']['c_ell'] = c_ell_lenspotential
         
+        
     def add_prim_reduced_bispectrum(self, prim_shape, radii, name=None):
         '''
         Compute the factors of the reduced bispectrum for a given
@@ -300,7 +375,7 @@ def add_prim_reduced_bispectrum(self, prim_shape, radii, name=None):
         amps = np.asarray(prim_shape.amps)
         amps *= 2 * (2 * np.pi ** 2 * self.camb_params.InitPower.As) ** 2 * (3 / 5)
 
-        # Call C code.
+        # Call C code
         red_bisp = rf.radial_func(f_k, tr_ell_k, k, radii, ells_sparse)
 
         factors, rule, weights = self._parse_prim_reduced_bispec(
@@ -1149,7 +1224,7 @@ def write(self, filename):
             f.create_dataset('rule', data=self.rule)
             f.create_dataset('weights', data=self.weights)
             f.create_dataset('ells_full', data=self.ells_full)
-            f.create_dataset('name', data=np.string_(self.name))
+            f.create_dataset('name', data=np.bytes_(self.name))
 
     @classmethod
     def init_from_file(cls, filename):
diff --git a/ksw/estimator.py b/ksw/estimator.py
index 7c04812..55de234 100644
--- a/ksw/estimator.py
+++ b/ksw/estimator.py
@@ -78,6 +78,8 @@ def __init__(self, red_bispectra, icov, lmax, pol, precision='single'):
         self.mc_gt = None
         self.mc_gt_sq = None
 
+        self.gt_local = []
+        
         self.lmax = lmax
         self.pol = pol
 
@@ -374,6 +376,104 @@ def step_batch(self, alm_loader, alm_files, comm=None, verbose=False, **kwargs):
                 
         self.mc_idx += mc_idx
 
+    def step_batch_2pass(self, alm_loader, alm_files, comm=None, verbose=False, **kwargs):
+        '''
+        Compute grad T its mean for a set of simulations, by loading and processing
+        several alms in parallel using MPI.
+
+        Parameters
+        ----------
+        alm_loader : callable
+            Function that returns alms on rank given filename as first argument.
+        alm_files : array_like
+            List of alm files to load.
+        comm : MPI communicator, optional
+        verbose : bool, optional
+            Print process.
+        kwargs : dict, optional
+            Optional keyword arguments passed to "_step".        
+        '''
+
+        if comm is None:
+            comm = utils.FakeMPIComm()
+
+        # Monte Carlo quantities local to rank.
+        mc_idx_loc = 0
+        mc_gt_loc = None
+
+        # Split alm_file loop over ranks.
+        for alm_file in alm_files[comm.Get_rank():len(alm_files):comm.Get_size()]:
+
+            if verbose:
+                print(f'rank {comm.rank:3}: loading {alm_file}')
+            alm = alm_loader(alm_file)
+            if verbose:
+                print(f'rank {comm.rank:3}: done loading')
+            grad_t = self._step(alm, **kwargs)
+
+            if mc_gt_loc is None:
+               mc_gt_loc = grad_t
+            else:
+               mc_gt_loc += grad_t
+
+            # # NOTE
+            # if mc_gt_loc is None:
+            #    mc_gt_loc = grad_t.copy()
+            # else:
+            #    mc_gt_loc += grad_t.copy()
+
+            # The copy here is important, otherwise each iteration of the
+            # loop adds grad to the first element of the list.
+            self.gt_local.append(grad_t.copy())
+            
+            mc_idx_loc += 1
+
+        print(f'rank : {comm.rank:3} waiting in step_batch')
+        # To allow allreduce when number of ranks > alm files.
+        shape, dtype = utils.bcast_array_meta(mc_gt_loc, comm, root=0)
+        if mc_gt_loc is None: mc_gt_loc = np.zeros(shape, dtype=dtype)
+        if mc_idx_loc is None: mc_idx_loc = 0
+
+        mc_gt = utils.allreduce_array(mc_gt_loc, comm)
+        mc_idx = utils.allreduce(mc_idx_loc, comm)
+        print(f'rank : {comm.rank:3} after reduce in step_batch')
+
+        # All ranks get to update the internal mc variable themselves.
+        if self.mc_gt is None:
+            self.mc_gt = mc_gt
+        else:
+            self.__mc_gt += mc_gt
+
+        self.mc_idx += mc_idx
+
+    def compute_fisher_2pass(self, comm):
+        '''
+        Compute the Fisher information from the grad T maps computed
+        on all ranks.
+
+        Returns
+        -------
+        fisher : float, None
+            Fisher information.
+        '''
+
+        if self.mc_gt is None:
+            return None
+
+        fisher_local = 0.
+        for gidx, gt in enumerate(self.gt_local):
+
+            diff = gt - self.mc_gt
+            diff_icov = self.icov(diff)
+            dot = utils.contract_almxblm(diff, np.conj(diff_icov))
+            print(f'{comm.rank=}, {gidx=}, fisher estimate={dot / 3}')
+            fisher_local += dot
+
+        fisher = utils.allreduce(fisher_local, comm)
+        fisher /= (3 * self.mc_idx)
+
+        return fisher
+        
     def compute_estimate_batch(self, alm_loader, alm_files, comm=None, 
                                verbose=False, **kwargs):
         '''
@@ -411,8 +511,6 @@ def compute_estimate_batch(self, alm_loader, alm_files, comm=None,
         lin_terms = np.zeros(len(alm_files))
         fishers = np.zeros(len(alm_files))        
 
-        # NOTE, silly to recompute fisher for each aidx.
-        
         # Split alm_file loop over ranks.
         for aidx in range(comm.Get_rank(), len(alm_files), comm.Get_size()):
         
@@ -709,7 +807,127 @@ def compute_ng_sim_batch(self, alm_loader, alm_files, alm_writer,
 
             if verbose:
                 print(f'rank {comm.rank:3}: done writing {oalm_file}')
-            
+
+    def write_state_2pass(self, filename, fisher, comm=None):
+        '''
+        Write internal state, i.e. mc_gt, fisher and mc_idx, for the 2pass-mode
+        of the code.
+
+        Parameters
+        ----------
+        filename : str
+            Absolute path to output file.
+        fisher : float
+            Output from compute_fisher_2pass.
+        comm : MPI communicator, optional
+            If provided, rank 0 is assumed to do the writing, so must be present.
+        '''
+
+        if comm is None:
+            comm = utils.FakeMPIComm()
+
+        if comm.Get_rank() == 0:
+            # Remove file extension to be consistent.
+            filename, _ = os.path.splitext(filename)
+
+            mc_idx_to_save = np.asarray([self.mc_idx], dtype=np.int64)
+
+            fisher_to_save = np.asarray([fisher], dtype=np.float64)            
+
+            if self.__mc_gt is None:
+                mc_gt_to_save = np.asarray([np.nan], dtype=self.cdtype)
+            else:
+                mc_gt_to_save = self.__mc_gt
+
+            with h5py.File(filename + '.hdf5', 'w') as f:
+                f.create_dataset('mc_idx', data=mc_idx_to_save)
+                f.create_dataset('fisher', data=fisher_to_save)
+                f.create_dataset('mc_gt', data=mc_gt_to_save)
+
+    def _read_state_2pass(self, filename, comm=None):
+        '''
+        Read internal state, i.e. mc_gt, fisher and mc_idx, from hdf5 file.
+
+        Parameters
+        ----------
+        filename : str
+            Absolute path to output file.
+        comm : MPI communicator, optional
+            If provided, rank 0 is assumed to do the reading, result will be 
+            broadcasted to all ranks.
+
+        Returns
+        -------
+        mc_idx : int
+            Counter for Monte Carlo estimates.
+        fisher : float, None
+            Fisher information.
+        mc_gt : (npol, nelem) complex array, None
+            <grad T (C^-1 a)> Monte Carlo estimate.
+        '''
+
+        if comm is None:
+            comm = utils.FakeMPIComm()
+
+        if comm.Get_rank() == 0:
+            # Remove file extension to be consistent.
+            filename, _ = os.path.splitext(filename)
+
+            with h5py.File(filename + '.hdf5', 'r') as f:
+                mc_idx_read = f['mc_idx'][()]
+                fisher_read = f['fisher'][()]
+                mc_gt_read = f['mc_gt'][()]
+
+            assert mc_idx_read.size == 1, (f'mc_idx has to be single int, got '
+                                      f'{mc_idx.size}-sized array')
+            mc_idx_read = int(mc_idx_read[0])
+                        
+        else:
+            mc_idx_read = None
+            fisher_read = None
+            mc_gt_read = None
+
+        mc_idx_read = utils.bcast(mc_idx_read, comm, root=0)
+        fisher_read = utils.bcast_array(fisher_read, comm, root=0)
+        mc_gt_read = utils.bcast_array(mc_gt_read, comm, root=0)
+
+        if fisher_read.size == 1 and np.isnan(fisher_read)[0]:
+            fisher_read = None
+        else:
+            fisher_read = float(fisher_read[0])
+
+        if mc_gt_read.size == 1 and np.isnan(mc_gt_read)[0]:
+            mc_gt_read = None
+        
+        return mc_idx_read, fisher_read, mc_gt_read
+    
+    def start_from_read_state_2pass(self, filename, comm=None):
+        '''
+        Return Fisher information and update estimator state with
+        mc_gt and mc_idx read from .hdf5 file.
+
+        Parameters
+        ----------
+        filename : str
+            Absolute path to output file.
+        comm : MPI communicator, optional
+            If provided, rank 0 is assumed to do the reading, result will be 
+            broadcasted to all ranks.
+
+        Returns
+        -------
+        fisher : float
+            Fisher information.
+        '''
+        
+        mc_idx_read, fisher_read, mc_gt_read = self._read_state_2pass(
+            filename, comm=comm)
+        
+        self.mc_idx = mc_idx_read
+        self.__mc_gt = mc_gt_read
+
+        return fisher_read
+                
     def write_state(self, filename, comm=None):
         '''
         Write internal state, i.e. mc_gt, mc_gt_sq and mc_idx, to hdf5 file.
diff --git a/tests/python/test_cosmology.py b/tests/python/test_cosmology.py
index a463566..a998a3f 100644
--- a/tests/python/test_cosmology.py
+++ b/tests/python/test_cosmology.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 from scipy.special import spherical_jn
+from scipy.integrate import trapezoid
 import os
 import tempfile
 import pathlib
@@ -36,7 +37,7 @@ def test_cosmology_init(self):
         self.assertIs(cosmo.camb_params.DoLensing, True)
         self.assertEqual(cosmo.camb_params.Accuracy.AccuracyBoost, 2)
         self.assertEqual(cosmo.camb_params.Accuracy.lSampleBoost, 2)
-        self.assertEqual(cosmo.camb_params.Accuracy.lAccuracyBoost, 2)
+        self.assertEqual(cosmo.camb_params.Accuracy.lAccuracyBoost, 2) 
         self.assertIs(cosmo.camb_params.Accuracy.AccurateBB, True)
         self.assertIs(cosmo.camb_params.Accuracy.AccurateReionization, True)
         self.assertEqual(cosmo.camb_params.Accuracy.BessIntBoost, 30)
@@ -125,6 +126,30 @@ def test_cosmology_compute_transfer(self):
         self.assertEqual(ells[0], 2)
         self.assertEqual(ells[-1], lmax)
 
+    def test_cosmology_compute_transfer_tensor(self):
+
+        lmax = 300
+        pars = camb.CAMBparams()
+        pars.set_cosmology(**self.cosmo_opts)
+
+        cosmo = Cosmology(pars)
+        cosmo.compute_transfer_tensor(lmax)
+
+        ells = cosmo.transfer['ells_tensor']
+        k = cosmo.transfer['k_tensor']
+        tr_ell_k = cosmo.transfer['tr_ell_k_tensor']
+
+        npol = 3   # tensor has T, E, B
+        nell = ells.size
+        nk = k.size
+
+        self.assertEqual(tr_ell_k.shape, (nell, nk, npol))
+        self.assertTrue(tr_ell_k.flags['C_CONTIGUOUS'])
+        self.assertTrue(tr_ell_k.flags['OWNDATA'])
+        self.assertEqual(ells[0], 2)
+        self.assertEqual(ells[-1], lmax)
+
+
     def test_cosmology_compute_transfer_err_value(self):
 
         lmax = 299 # Too low value.
@@ -227,13 +252,13 @@ def test_cosmology_compute_transfer_power(self):
         for lidx in range(ells.size):
             
             # TT.
-            c_ell[lidx,0] = np.trapz(k ** 2 * p_k * tr[lidx,:,0] ** 2, k)
+            c_ell[lidx,0] = trapezoid(k ** 2 * p_k * tr[lidx,:,0] ** 2, k)
 
             # EE.
-            c_ell[lidx,1]  = np.trapz(k ** 2 * p_k * tr[lidx,:,1] ** 2, k)
+            c_ell[lidx,1]  = trapezoid(k ** 2 * p_k * tr[lidx,:,1] ** 2, k)
 
             # TE.
-            c_ell[lidx,3] = np.trapz(k ** 2 * p_k * tr[lidx,:,0] * tr[lidx,:,1], k)
+            c_ell[lidx,3] = trapezoid(k ** 2 * p_k * tr[lidx,:,0] * tr[lidx,:,1], k)
 
         c_ell *= (2 / np.pi)
 
@@ -275,13 +300,13 @@ def test_cosmology_compute_transfer_power_ns(self):
         for lidx in range(ells.size):
             
             # TT.
-            c_ell[lidx,0] = np.trapz(k ** 2 * p_k * tr[lidx,:,0] ** 2, k)
+            c_ell[lidx,0] = trapezoid(k ** 2 * p_k * tr[lidx,:,0] ** 2, k)
 
             # EE.
-            c_ell[lidx,1]  = np.trapz(k ** 2 * p_k * tr[lidx,:,1] ** 2, k)
+            c_ell[lidx,1]  = trapezoid(k ** 2 * p_k * tr[lidx,:,1] ** 2, k)
 
             # TE.
-            c_ell[lidx,3] = np.trapz(k ** 2 * p_k * tr[lidx,:,0] * tr[lidx,:,1], k)
+            c_ell[lidx,3] = trapezoid(k ** 2 * p_k * tr[lidx,:,0] * tr[lidx,:,1], k)
 
         c_ell *= (2 / np.pi)
 
@@ -375,7 +400,7 @@ def test_cosmology_add_prim_reduced_bispectrum(self):
 
         integrand = k ** 2 * spherical_jn(ell, radius * k)
         integrand *= tr_ell_k[lidx,:,pidx] * func
-        ans_expec = (2 / np.pi) * np.trapz(integrand, k)
+        ans_expec = (2 / np.pi) * trapezoid(integrand, k)
 
         lidx_full = ell - 2
         ans = red_bisp.factors[cidx*nr+ridx,pidx,lidx_full]
diff --git a/tests/python/test_radial_func.py b/tests/python/test_radial_func.py
index dd6a1e3..d52e2c4 100644
--- a/tests/python/test_radial_func.py
+++ b/tests/python/test_radial_func.py
@@ -4,6 +4,7 @@
 import unittest
 import numpy as np
 from scipy.special import spherical_jn
+from scipy.integrate import trapezoid
 
 from ksw import radial_functional as rf
 
@@ -154,7 +155,7 @@ def test_radial_func(self):
         
         exp_f_ell_r = np.zeros((nr, nell, npol, ncomp), dtype=float)
         for ridx, rp in enumerate(radii_prime):            
-            exp_f_ell_r[0,0,0,ridx] = (2. / np.pi) * np.trapz(
+            exp_f_ell_r[0,0,0,ridx] = (2. / np.pi) * trapezoid(
              f_k[:,ridx] * spherical_jn(ell, k * radius) * k ** 2, k)
         
         f_ell_r = rf.radial_func(f_k, tr_ell_k, k, radii, ells)
diff --git a/tests/python/test_utils.py b/tests/python/test_utils.py
index feb8d1d..3241a1c 100644
--- a/tests/python/test_utils.py
+++ b/tests/python/test_utils.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 import healpy as hp
+from scipy.integrate import trapezoid
 from mpi4py import MPI
 
 from ksw import utils
@@ -17,7 +18,7 @@ def test_utils_get_trapz_weights(self):
 
         y = np.asarray([6, 3, 7, 3])
 
-        self.assertAlmostEqual(np.sum(y * dx), np.trapz(y, x))
+        self.assertAlmostEqual(np.sum(y * dx), trapezoid(y, x))
 
     def test_utils_get_trapz_weights_err(self):