Source code for pyatmlab.datasets.tanso

import collections

import numpy
import scipy.interpolate

import h5py

from .. import dataset
from .. import physics
from .. import math as pamath
from ..constants import ppm as PPM, hecto as HECTO

class TansoFTSBase(dataset.ProfileDataset):
[docs] """Applicable to both Tanso FTS versions """ p_for_T_profile = numpy.array([1000, 975, 950, 925, 900, 850, 800, 700, 600, 500, 400, 300, 250, 200, 150, 100, 70, 50, 30, 20, 10])*HECTO # hPa -> Pa A_needs_converting = False A_needs_swapping = False def _read_common(self, h5f): """From open h5 file, read some fields common to both tanso versions """ D = collections.OrderedDict() time_raw = h5f["scanAttribute"]["time"] # force UTC time D["time"] = numpy.array([numpy.datetime64(time_raw[i].decode('ascii')+'Z') for i in range(time_raw.size)], dtype="datetime64[us]") D["lat"] = h5f["Data"]["geolocation"]["latitude"] D["lon"] = h5f["Data"]["geolocation"]["longitude"] D["z0"] = h5f["Data"]["geolocation"]["height"] D["T"] = h5f["scanAttribute"]["referenceData"]["temperatureProfile"] D["h2o"] = h5f["scanAttribute"]["referenceData"]["waterVaporProfile"][...]*PPM D["p0"] = h5f["scanAttribute"]["referenceData"]["surfacePressure"][...] * HECTO D["id"] = h5f["scanAttribute"]["scanID"] return D class TansoFTSv10x(dataset.MultiFileDataset, TansoFTSBase):
[docs] """Tanso FTS v1.00, 1.01 """ re = r"GOSATTFTS(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})_02P02TV010[01]R\d{6}[0-9A-F]{5}\.h5" aliases = {"CH4_profile": "ch4_profile", "ak": "ch4_ak", "delta_CH4_profile": "ch4_profile_e"} n_prof = "p" # For TANSO collocating with PEARL, 50% of profiles have > 40% # sensitivity between 5.3 km and 9.7 km. # # For TANSO collocating with ACE, 20% of profiles have > 30% # sensitivity between 5.2 km and 9.5 km. range = (5e3, 10e3) # NOTE: For Tanso FTS v1.0x, there are THREE pressure profiles: # # - CH4 has 23 levels: 1165.9, 857.7, 735.6, 631.0, 541.2, 464.2, # 398.1, 341.4, 287.3, 237.1, 195.7, 161.6, 133.4, 100.0, 75.0, 51.1, # 34.8, 23.7, 16.2, 10.0, 5.6, 1.0, 0.1 hPa (GOSAT Product Description) # # - For each retrieval, the average pressure in each of 22 layers is # stored (this one varies) # # - For temperature and water vapour, there are 21 vertical levels: # 1000, 975, 950, 925, 900, 850, 800, 700, 600, 500, 400, 300, 250, # 200, 150, 100, 70, 50, 30, 20, 10 hPa. # # To get CH4 on z-grid rather than p-grid, need to convert p-grid to # z-grid. This needs temperature and water vapour profiles. Either # interpolate *and* extrapolate z-grid onto retrieval grid, or # interpolate retrieval *and* averaging kernel onto T/Q-grid. Prefer # the former because ak-interpolation I found is linear which is OK # for z but not for p? # # implementation of abstract methods def _read(self, path, fields="all"): D = collections.OrderedDict() with h5py.File(path, 'r') as h5f: D.update(self._read_common(h5f)) D["ch4_profile"] = (h5f["Data"]["originalProfile"] ["CH4Profile"][:]*PPM) D["ch4_profile_e"] = (h5f["Data"]["originalProfile"] ["CH4ProfileError"][:]*PPM) D["ch4_ak"] = h5f["Data"]["originalProfile"]["CH4AveragingKernelMatrix"] D["p"] = h5f["Data"]["originalProfile"]["pressure"][...]*HECTO D["z0"] = h5f["Data"]["geolocation"]["height"] A = numpy.empty(shape=D["time"].size, dtype=[(k, D[k].dtype, D[k].shape[1:]) for k in D.keys()]) for k in D.keys(): A[k] = D[k][:] return A if fields=="all" else A[fields] def get_z(self, obj, field=None):
        if field is None:
            try:
                return super().get_z(obj)
            except IndexError:
                pass # parent failed, continue here
        elif field != "T":
            raise ValueError("No special case for {}".format(field))
        # See comment near top of this class; different p-grids, different
        # z-grids.  Want the z-grid corresponding to the CH4 profile.
        p_for_T = self.p_for_T_profile # down to 1 kPa
        p_for_CH4 = obj["p"] # down to ~50 Pa
        p_CH4_valid = numpy.isfinite(p_for_CH4)
        p_CH4_valid[p_CH4_valid] = p_for_CH4[p_CH4_valid] > 0
        p_for_CH4 = p_for_CH4[p_CH4_valid]
        p_T_valid = numpy.isfinite(p_for_T)
        p_T_valid[p_T_valid] = p_for_T[p_T_valid] > 0
        p_for_T = p_for_T[p_T_valid]
        p_valid = p_T_valid if field == "T" else p_CH4_valid
        # inter- and extrapolate T and h2o onto the p_for_CH4 grid.
        # This introduces an extrapolation error, but between 1 kPa and
        # 50 Pa it should be insignificant.
        #
        # The highest level is the lowest pressure, i.e. the lower bound,
        # and vice versa.  Therefore reverse p and f(p) so that p is
        # increasing.
        xb = min(p_for_CH4.min(), p_for_T.min())
        # Including obj["p0"] here guarantees that the reference
        # pressure/altitude is always included in the profile, satisfying
        # Patrick's algorithm (p2z_hydrostatic).
        #
        # The purpose of this interpolation is to find temperature and
        # water vapour mixing ratio in order to convert pressure to
        # elevation.  Even setting the water vapour mixing ratio
        # completely to 0 leads to sub-metre differences in the elevation
        # calculation.  This means that a "1-D spline" (i.e. linear
        # interpolation in T and log(vmr_h2o), k=1) is acceptable.
        xe = max(p_for_CH4.max(), p_for_T.max(), obj["p0"])
        tck_T = scipy.interpolate.splrep(p_for_T[::-1], obj["T"][::-1],
            xb=xb, xe=xe, k=1)
        tck_logh2o = scipy.interpolate.splrep(p_for_T[::-1],
            numpy.log(obj["h2o"][::-1]), xb=xb, xe=xe, k=1)
        # Extrapolate, add extra pressure points.  Make sure that
        # p0 >= p_grid[0]; also include 100 kPa as this was in the
        # original grid.  For completeness, merge both pressure grids,
        # then "unpack" later.  Sort by index so the parts belonging to
        # p_for_CH4 can be extracted later.
        #
        # FIXME: Do I really need to "unpack" it again?  Or just stick
        # with the merged grid?!
        p_full = []
        p_full.append(obj["p0"])
        p_for_T_i0 = len(p_full)
        p_full.extend(p_for_T)
        p_for_T_i1 = len(p_full)
        p_for_ch4_i0 = len(p_full)
        p_full.extend(p_for_CH4)
        p_for_ch4_i1 = len(p_full)
        p_full = numpy.array(p_full, dtype="f4")
        p_inds = numpy.argsort(p_full)[::-1] # high to low pressure
        # masked arrays go wrong within the p2z algorithm...
        # rather do masking "by hand" :(
        p_full_sorted = p_full[p_inds]
        ok = (numpy.isfinite(p_full_sorted) & (p_full_sorted > 0))
        # p_full_sorted = numpy.ma.masked_less(p_full_sorted, 0)
        # T_retgrid = numpy.ma.masked_array(
        T_retgrid = numpy.zeros_like(p_full_sorted)
        T_retgrid.fill(numpy.nan)
        T_retgrid[ok] = scipy.interpolate.splev(
            p_full_sorted[ok], tck_T, ext=1)
        # q_retgrid = numpy.ma.masked_array(
        logq_retgrid = numpy.zeros_like(p_full_sorted)
        logq_retgrid.fill(numpy.nan)
        logq_retgrid[ok] = scipy.interpolate.splev(
            p_full_sorted[ok], tck_logh2o, ext=1)
        q_retgrid = numpy.zeros_like(p_full_sorted)
        q_retgrid.fill(numpy.nan)
        q_retgrid[ok] = numpy.exp(logq_retgrid[ok])
        z_extp = numpy.zeros(shape=p_full_sorted.shape, dtype="f4")
        z_extp.fill(numpy.nan)
        ok = (ok & numpy.isfinite(T_retgrid) & numpy.isfinite(q_retgrid))
        ok[ok] = (ok[ok] & (T_retgrid[ok] > 0) & (q_retgrid[ok] > 0))
        z_extp[ok] = physics.p2z_hydrostatic(
            p_full_sorted[ok], T_retgrid[ok], q_retgrid[ok],
            obj["p0"], obj["z0"], obj["lat"], -1)
        # return only elements belonging to p_for_CH4
        # this yields the "inverse" sorting indices to get back the
        # original order, but only for those that were part of p_for_CH4
        p_i_was_CH4 = ((p_inds>=p_for_ch4_i0) &
                       (p_inds<p_for_ch4_i1)).nonzero()[0]
        p_i_was_T = ((p_inds>=p_for_T_i0) &
                     (p_inds<p_for_T_i1)).nonzero()[0]
        z_ret = numpy.zeros(dtype="f4",
            shape=(p_T_valid if field=="T" else p_CH4_valid).shape)
        z_ret[p_valid] = z_extp[p_i_was_T if field=="T" else p_i_was_CH4]
        z_ret[~p_valid] = numpy.nan
        return z_ret
        #return z_extp[p_i_was_CH4]#.data # no masked array...
        # inv_inds[-len(p_for_CH4):]]
        # z_for_T = physics.p2z_hydrostatic(
        #     self.p_for_T_profile, obj["T"], obj["h2o"],
        #     obj["p0"], obj["z0"], obj["lat"], -1)
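
    # Illustrative sketch (not part of the original source; names and numbers
    # are made up): the interpolation in get_z above amounts to fitting
    # degree-1 splines to T and log(h2o) against the reversed (hence
    # increasing) pressure grid of the temperature profile, then evaluating
    # them on another pressure grid, e.g.
    #
    #     p_T = numpy.array([1000., 800., 500., 100.])*HECTO  # descending, Pa
    #     T = numpy.array([288., 272., 252., 220.])           # K, on p_T
    #     tck_T = scipy.interpolate.splrep(p_T[::-1], T[::-1], k=1)
    #     T_new = scipy.interpolate.splev(
    #         numpy.array([900., 300.])*HECTO, tck_T)
    #
    # h2o is interpolated in log space, presumably because its mixing ratio
    # spans orders of magnitude across the profile.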

    def get_z_for(self, obj, field):
        return self.get_z(obj, field)

    def get_time_from_granule_contents(self, p):
        M = self.read(p, fields={"time"})
        return (M["time"].min(), M["time"].max())

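# Illustrative sketch (not part of the original module; the function name and
# numbers are made up): TansoFTSv10x.get_z above merges two pressure grids,
# sorts the merged grid from high to low pressure, and afterwards needs to
# know which positions in the sorted grid originally came from the CH4 grid.
# The index bookkeeping in isolation:
def _example_recover_positions_after_sort():
    p_T = numpy.array([1000., 500., 100.])        # one grid
    p_ch4 = numpy.array([900., 300.])             # the other grid
    p_full = numpy.r_[p_T, p_ch4]                 # merged, unsorted
    i0 = p_T.size                                 # first index of the CH4 part
    i1 = p_full.size                              # one past its last index
    p_inds = numpy.argsort(p_full)[::-1]          # high to low pressure
    p_sorted = p_full[p_inds]                     # [1000, 900, 500, 300, 100]
    was_ch4 = ((p_inds >= i0) & (p_inds < i1)).nonzero()[0]
    return p_sorted[was_ch4]                      # array([900., 300.])
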
class TansoFTSv001(dataset.SingleFileDataset, TansoFTSBase):
    # Retained for historical reasons
    # start_date = datetime.datetime(2010, 3, 23, 2, 24, 54, 210)
    # end_date = datetime.datetime(2010, 10, 31, 20, 34, 50, 814)
    # srcfile = ("/home/gerrit/sshfs/glacier/data/1/gholl/data/1403050001/"
    #            "GOSATTFTS20100316_02P02TV0001R14030500010.h5")
    # name = "GOSAT Tanso FTS"

    p_for_interp_profile = numpy.array([1000, 700, 500, 300, 100, 50, 10])*HECTO

    aliases = {"CH4_profile": "ch4_profile_raw"}

    @classmethod
    def vmr_to_column_density(cls, data):
[docs] """Convert VMR to column density Currently hardcoded to CH4 profiles and the dtype by this classes' read method. Returns number column density and its error Note that a small error may be introduced if profiles do not go down to the reference provided by (z0, p0), because the p2z function needs a reference inside the p-profile. This is mitigated by adding a dummy level with a dummy temperature, then removing this after the z-profile is calculated. This has a minor (<0.1%) effect on the remaining levels. """ # temperature interpolated on each pressure grid ncd = numpy.empty(shape=data.shape, dtype="f8") ncd_e = numpy.empty_like(ncd) for n in range(data.size): if data["T"].mask.size == 1: T = data["T"][n, ::-1].data else: T = data["T"][n, ::-1].data[~data["T"][n, ::-1].mask] if data["h2o"].mask.size == 1: h2o = data["h2o"][n, ::-1].data else: h2o = data["h2o"][n, ::-1].data[~data["h2o"][n, ::-1].mask] T_interpolator = scipy.interpolate.interp1d( cls.p_for_T_profile[::-1], T)# should be increasing h2o_interpolator = scipy.interpolate.interp1d( cls.p_for_T_profile[::-1], h2o) p = data["p_raw"][n, :].data[~data["p_raw"][n, :].mask]*HECTO T_interp = T_interpolator(p) h2o_interp = h2o_interpolator(p) nd = physics.vmr2nd( data["ch4_profile_raw"][n, :].data[ ~data["ch4_profile_raw"][n, :].mask],#*PPM, T_interp, p) nd_e = physics.vmr2nd( data["ch4_profile_raw_e"][n, :].data[ ~data["ch4_profile_raw_e"][n, :].mask],#*PPM, T_interp, p) #z = physics.p2z_oversimplified(p) if data["p0"][n] > p[0]: # need to add dummy p[0] < p to satisfy p2z # # this is inaccurate, but has little effect on the # *relative* location of other levels (<1 m), so it is an # acceptable source of error dummy = True pp = numpy.r_[data["p0"][n], p] Tp = numpy.r_[287, T_interp] # although putting h2o-surface to 0 is an EXTREMELY # bad approximation, it /still/ doesn't matter for the # relative thicknesses higher up h2op = numpy.r_[h2o_interp[0], h2o_interp] else: dummy = False (pp, Tp, h2op) = (p, T_interp, h2o_interp) #z = physics.p2z_hydrostatic(p, T_interp, h2o_interp, z = physics.p2z_hydrostatic(pp, Tp, h2op*PPM, p0=data["p0"][n], z0=data["z0"][n], lat=data["lat"][n])[dummy:] ncd[n] = pamath.integrate_with_height(z, nd) ncd_e[n] = pamath.integrate_with_height(z, nd_e) return (ncd, ncd_e) # implementation of abstract methods def _read(self, path=None, fields="all"):
"""Read Tanso FTS granule. Currently hardcoded for CH4 raw&interp. """ if path is None: path = self.srcfile with h5py.File(path, 'r') as h5f: D = collections.OrderedDict() D.update(self._read_common(h5f)) p = h5f["Data"]["interpolatedProfile"]["pressure"][:] p *= HECTO # Pa -> hPa if self.p_for_interp_profile is not None: if not (self.p_for_interp_profile == p).all(): raise ValueError("Found inconsistent pressure" " profiles!") else: self.p_for_interp_profile = p #D["p"] = h5f["Data"]["originalProfile"]["pressure"] D["ch4_profile_interp"] = (h5f["Data"]["interpolatedProfile"] ["CH4Profile"][:]*PPM) D["ch4_profile_raw"] = (h5f["Data"]["originalProfile"] ["CH4Profile"][:]*PPM) D["ch4_profile_raw_e"] = (h5f["Data"]["originalProfile"] ["CH4ProfileError"][:]*PPM) D["p_raw"] = h5f["Data"]["originalProfile"]["pressure"] A = numpy.empty(shape=time_raw.size, dtype=[(k, D[k].dtype, D[k].shape[1:]) for k in D.keys()]) for k in D.keys(): A[k] = D[k][:] # Unfortunately, masked arrays are buggy # A = A.view(numpy.ma.MaskedArray) # for k in {"ch4_profile_interp", "ch4_profile_raw", # "ch4_profile_raw_e", "p_raw", "T"}: # A.mask[k][A.data[k]<0] = True return A if fields=="all" else A[fields] def get_z(self, meas):
        try:
            return super().get_z(meas)
        except IndexError:
            pass # parent failed, continue here
        # if data["T"].mask.size == 1:
        #     T = data["T"][n, ::-1].data
        # else:
        #     T = data["T"][n, ::-1].data[~data["T"][n, ::-1].mask]
        #
        # if data["h2o"].mask.size == 1:
        #     h2o = data["h2o"][n, ::-1].data
        # else:
        #     h2o = data["h2o"][n, ::-1].data[~data["h2o"][n, ::-1].mask]

        # Can't use masked arrays due to bug:
        # https://github.com/numpy/numpy/issues/2972
        # Use workaround instead.
        if isinstance(meas, numpy.ma.core.MaskedArray):
            meas = meas.data
        valid = meas["p_raw"] > 0
        T_interpolator = scipy.interpolate.interp1d(
            self.p_for_T_profile[::-1], meas["T"][::-1]) # should be increasing
        h2o_interpolator = scipy.interpolate.interp1d(
            self.p_for_T_profile[::-1], meas["h2o"][::-1])
        # p = data["p_raw"][n, :].data[~data["p_raw"][n, :].mask]*HECTO
        p = meas["p_raw"] * HECTO
        p = p[valid]
        T_interp = T_interpolator(p)
        h2o_interp = h2o_interpolator(p)
        # nd = physics.vmr2nd(
        #     data["ch4_profile_raw"][n, :].data[
        #         ~data["ch4_profile_raw"][n, :].mask]*PPM,
        #     T_interp, p)
        #
        # nd_e = physics.vmr2nd(
        #     data["ch4_profile_raw_e"][n, :].data[
        #         ~data["ch4_profile_raw_e"][n, :].mask]*PPM,
        #     T_interp, p)
        #z = physics.p2z_oversimplified(p)
        if meas["p0"] > p[0]:
            # need to add a dummy surface level so that p0 lies inside the
            # p-profile, to satisfy p2z
            #
            # this is inaccurate, but has little effect on the
            # *relative* location of other levels (<1 m), so it is an
            # acceptable source of error
            dummy = True
            pp = numpy.hstack((meas["p0"], p))
            Tp = numpy.hstack((287, T_interp))
            # although putting h2o-surface to a low value is an EXTREMELY
            # bad approximation, it /still/ doesn't matter for the
            # relative thicknesses higher up
            h2op = numpy.hstack((h2o_interp[0], h2o_interp))
        else:
            dummy = False
            (pp, Tp, h2op) = (p, T_interp, h2o_interp)
        #z = physics.p2z_hydrostatic(p, T_interp, h2o_interp,
        z = physics.p2z_hydrostatic(pp, Tp, h2op*PPM,
            p0=meas["p0"], z0=meas["z0"],
            lat=meas["lat"])[dummy:]
        return z
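

# Illustrative sketch (not part of the original module; the function name and
# numbers are made up, and cumsum merely stands in for p2z_hydrostatic):
# both TansoFTSv001.get_z and vmr_to_column_density prepend a dummy surface
# level when the surface pressure p0 exceeds the first profile pressure, so
# that the hydrostatic p->z conversion has its reference inside the grid,
# and then drop that level again with the [dummy:] slice:
def _example_dummy_surface_level():
    p = numpy.array([950., 700., 300.])*HECTO     # profile pressures, Pa
    p0 = 1010.*HECTO                              # surface pressure, Pa
    if p0 > p[0]:
        dummy = True
        pp = numpy.r_[p0, p]                      # prepend surface level
    else:
        dummy = False
        pp = p
    z = numpy.cumsum(pp)                          # stand-in for p2z_hydrostatic
    return z[dummy:]                              # drop the dummy level again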