Source code for ctmatching.dataset

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
``ctmatching`` algorithm example data set loader. For tests and demo use.

About re78 data:

- 1978 US earnings data by race, age, gender and education.
- 429 control samples and 185 treatment samples. Each sample has 10
  properties, not counting the ID.

Full description of this data: http://users.nber.org/~rdehejia/data/nswdata2.html.
If this link is not available, try this:
https://github.com/MacHu-GWU/ctmatching-project/blob/master/ctmatching/testdata/re78-readme.html
"""

import numpy as np
import site
import os


def load_re78():
    """re78 dataset loader.

    Reads ``testdata/re78.txt`` from the directory that contains this
    module, skips the header line and splits the remaining rows into two
    groups according to the value of their second column.

    Every returned record is a list of 11 values: field 0 is kept as a
    string, fields 1-7 are converted to ``int`` and fields 8-10 to ``float``.

    :returns: ``(control, treatment)`` -- two lists of records. A record
        whose field 1 is non-zero goes to ``treatment``, otherwise to
        ``control``.
    :raises IOError: (``OSError`` on Python 3) if the data file cannot be
        opened.
    :raises ValueError: if a numeric field cannot be converted.

    Usage::

        >>> from ctmatching import load_re78
        >>> control, treat = load_re78()
        >>> len(control)
        429
        >>> len(treat)
        185
    """
    # Locate the data file next to this module instead of going through
    # ``site.getsitepackages()[1]``: that function is not available in every
    # environment, its result may hold fewer than two entries, and it cannot
    # find the file for source checkouts or user-level installs.
    abspath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "testdata", "re78.txt")

    with open(abspath, "rb") as f:
        lines = f.read().decode("utf-8").split("\n")

    control = list()
    treatment = list()

    # lines[0] is the header row; only the data rows are parsed.
    for line in lines[1:]:
        line = line.strip()
        if not line:
            # A trailing newline (or any blank line) produces an empty
            # string here; it carries no record, so skip it.
            continue

        record = line.split(",")

        for i in [1, 2, 3, 4, 5, 6, 7]:
            record[i] = int(record[i])
        for i in [8, 9, 10]:
            record[i] = float(record[i])

        # Field 1 is the group flag: non-zero -> treatment, zero -> control.
        if record[1]:
            treatment.append(record)
        else:
            control.append(record)

    return control, treatment


if __name__ == "__main__":
    # Smoke test when run as a script: the bundled data set must split into
    # 429 control records and 185 treatment records.
    control, treatment = load_re78()
    assert (len(control), len(treatment)) == (429, 185)