swh:1:snp:b1bc5cf54dab7987e9f85b6673e79b3b1d27ac15
Raw File
Tip revision: 1921801ba8c39c62c7e5db14ed499d9b35d63bb4 authored by Arun Maiya on 02 August 2022, 15:41:20 UTC
minor edits
Tip revision: 1921801
examples.html
---

title: Examples


keywords: fastai
sidebar: home_sidebar

summary: "Various examples of CausalNLP on semi-simulated or real datasets."
description: "Various examples of CausalNLP on semi-simulated or real datasets."
nb_path: "nbs/99_examples.ipynb"
---
<!--

#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: nbs/99_examples.ipynb
# command to build the docs after a change: nbdev_build_docs

-->

<div class="container" id="notebook-container">
        
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">

</div>
    {% endraw %}

    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">causalnlp</span> <span class="kn">import</span> <span class="n">CausalInferenceModel</span>
<span class="kn">from</span> <span class="nn">causalnlp</span> <span class="kn">import</span> <span class="n">Autocoder</span>
</pre></div>

    </div>
</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="What-is-the-causal-impact-of-a-positive-review-on-product-views?">What is the causal impact of a positive review on product views?<a class="anchor-link" href="#What-is-the-causal-impact-of-a-positive-review-on-product-views?"> </a></h2><p>We use a semi-simulated dataset generated from <a href="https://github.com/rpryzant/causal-text">this repo</a>, which is available in the <code>sample_data</code> folder. The reviews and product types are real, while the outcomes (e.g., 1=product clicked, 0=not clicked) are simulated.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;sample_data/music_seed50.tsv&#39;</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">&#39;</span><span class="se">\t</span><span class="s1">&#39;</span><span class="p">,</span> <span class="n">error_bad_lines</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
</pre></div>

    </div>
</div>
</div>

</div>
    {% endraw %}

    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_html rendered_html output_subarea output_execute_result"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>index</th>
      <th>id</th>
      <th>rating</th>
      <th>product</th>
      <th>text</th>
      <th>summary</th>
      <th>price</th>
      <th>T_true</th>
      <th>C_true</th>
      <th>Y_sim</th>
      <th>negative</th>
      <th>positive</th>
      <th>T_ac</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>7</td>
      <td>0001388703</td>
      <td>1.0</td>
      <td>mp3 music</td>
      <td>buy the cd.  do not buy the mp3 album.  downlo...</td>
      <td>Buy the CD.  Do not buy the MP3.</td>
      <td>13.01</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0.548733</td>
      <td>0.451267</td>
      <td>0</td>
    </tr>
    <tr>
      <th>1</th>
      <td>8</td>
      <td>0001388703</td>
      <td>5.0</td>
      <td>mp3 music</td>
      <td>takes me back to my childhood!</td>
      <td>Love it!</td>
      <td>13.01</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>0.008373</td>
      <td>0.991627</td>
      <td>1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>12</td>
      <td>0001388703</td>
      <td>5.0</td>
      <td>audio cd</td>
      <td>the passion and ingenuity of green's music is ...</td>
      <td>No one like Keith Green</td>
      <td>13.01</td>
      <td>1</td>
      <td>1</td>
      <td>1</td>
      <td>0.043761</td>
      <td>0.956239</td>
      <td>1</td>
    </tr>
    <tr>
      <th>3</th>
      <td>13</td>
      <td>0001388703</td>
      <td>5.0</td>
      <td>mp3 music</td>
      <td>keith's music is a timeless message.  since hi...</td>
      <td>Never Gets Old</td>
      <td>13.01</td>
      <td>1</td>
      <td>0</td>
      <td>1</td>
      <td>0.038876</td>
      <td>0.961124</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>15</td>
      <td>0001377647</td>
      <td>5.0</td>
      <td>audio cd</td>
      <td>i have fallen in love with john michael talbot...</td>
      <td>Talbot a masterpiece</td>
      <td>18.99</td>
      <td>1</td>
      <td>1</td>
      <td>1</td>
      <td>0.019828</td>
      <td>0.980172</td>
      <td>1</td>
    </tr>
  </tbody>
</table>
</div></div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p><code>Y_sim</code> is the simulated outcome indicating whether or not the product was clicked.  <code>C_true</code> is a categorical variable, where 1 is an audio CD and 0 is something else (e.g., MP3).  In this dataset, outcomes were simulated such that <code>C_true</code> is a confounding variable for this problem.</p>

</div>
</div>
</div>
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>The treatment is whether or not the review is positive, which affects <code>Y_sim</code>.  Let's pretend we don't have a rating and need to infer this from text using the <a href="/causalnlp/autocoder.html#Autocoder"><code>Autocoder</code></a>. This can be done with:</p>
<div class="highlight"><pre><span></span><span class="n">ac</span> <span class="o">=</span> <span class="n">Autocoder</span><span class="p">()</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ac</span><span class="o">.</span><span class="n">code_sentiment</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;text&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">values</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">batch_size</span><span class="o">=</span><span class="mi">16</span><span class="p">,</span> <span class="n">binarize</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">df</span><span class="p">[</span><span class="s1">&#39;T_ac&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;positive&#39;</span><span class="p">]</span>
</pre></div>
<p>We've already created this as the <code>T_ac</code> column (along with the <code>positive</code> and <code>negative</code> columns), so invoking the above is not needed. Note that <code>T_ac</code> is an imperfect approximation of <code>T_true</code>. In CausalNLP, we can include the raw text as covariates to improve our estimates.</p>
<p>Let's fit the causal inference model.  We will adjust for both <code>C_true</code> and the raw text of the review to minimize bias from confounding. CausalNLP supports the following metalearners: S-Learner, T-Learner, X-Learner, and R-Learner. See <a href="https://arxiv.org/abs/1706.03461">this paper</a> for more information on these. We will use the T-Learner as the metalearner here. By default, T-Learners use LightGBM classifiers with 31 leaves.  Let's increase the number of leaves to 500. In practice, you can supply a learner with hyperparameters that you've tuned beforehand to accurately predict the outcome.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">lightgbm</span> <span class="kn">import</span> <span class="n">LGBMClassifier</span>
<span class="kn">from</span> <span class="nn">sklearn.linear_model</span> <span class="kn">import</span> <span class="n">LogisticRegression</span><span class="p">,</span> <span class="n">LinearRegression</span>
<span class="n">cm</span> <span class="o">=</span> <span class="n">CausalInferenceModel</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;t-learner&#39;</span><span class="p">,</span>
                    <span class="n">learner</span><span class="o">=</span><span class="n">LGBMClassifier</span><span class="p">(</span><span class="n">num_leaves</span><span class="o">=</span><span class="mi">500</span><span class="p">),</span>
                    <span class="n">treatment_col</span><span class="o">=</span><span class="s1">&#39;T_ac&#39;</span><span class="p">,</span> 
                    <span class="n">outcome_col</span><span class="o">=</span><span class="s1">&#39;Y_sim&#39;</span><span class="p">,</span> 
                    <span class="n">text_col</span><span class="o">=</span><span class="s1">&#39;text&#39;</span><span class="p">,</span>
                    <span class="n">include_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;C_true&#39;</span><span class="p">])</span>
<span class="n">cm</span><span class="o">.</span><span class="n">fit</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">

<div class="output_subarea output_stream output_stdout output_text">
<pre>outcome column (categorical): Y_sim
treatment column: T_ac
numerical/categorical covariates: [&#39;C_true&#39;]
text covariate: text
preprocess time:  1.118110179901123  sec
start fitting causal inference model
time to fit causal inference model:  10.667636632919312  sec
</pre>
</div>
</div>

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>&lt;causalnlp.causalinference.CausalInferenceModel at 0x7f079361b0f0&gt;</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h3 id="Average-Treatment-Effect-(ATE)">Average Treatment Effect (ATE)<a class="anchor-link" href="#Average-Treatment-Effect-(ATE)"> </a></h3><p>We can calculate the overall average treatment effect (ATE) as follows:</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">estimate_ate</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>{&#39;ate&#39;: 0.1309311542209525}</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>The overall ATE is an increase of 13 percentage points in probability.</p>
<p>Unlike machine learning, there is no ground truth to which our estimate can be compared for causal inference on real-world datasets. However, since this is a simulated dataset, we can compare our estimate with the ground truth ATE of <code>0.1479</code> (14.79 percentage point change in outcome), and our estimate is close.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="k">def</span> <span class="nf">ATE_adjusted</span><span class="p">(</span><span class="n">C</span><span class="p">,</span> <span class="n">T</span><span class="p">,</span> <span class="n">Y</span><span class="p">):</span>
    <span class="n">x</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span>
    <span class="k">for</span> <span class="n">c</span><span class="p">,</span> <span class="n">t</span><span class="p">,</span> <span class="n">y</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">C</span><span class="p">,</span> <span class="n">T</span><span class="p">,</span> <span class="n">Y</span><span class="p">):</span>
        <span class="n">x</span><span class="p">[</span><span class="n">c</span><span class="p">,</span> <span class="n">t</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">y</span><span class="p">)</span>

    <span class="n">C0_ATE</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">])</span> <span class="o">-</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span><span class="mi">0</span><span class="p">])</span>
    <span class="n">C1_ATE</span> <span class="o">=</span>  <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">])</span> <span class="o">-</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span><span class="mi">0</span><span class="p">])</span>
    <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">([</span><span class="n">C0_ATE</span><span class="p">,</span> <span class="n">C1_ATE</span><span class="p">])</span>
<span class="nb">print</span><span class="p">(</span><span class="n">ATE_adjusted</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">C_true</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">T_true</span><span class="p">,</span> <span class="n">df</span><span class="o">.</span><span class="n">Y_sim</span><span class="p">))</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">

<div class="output_subarea output_stream output_stdout output_text">
<pre>0.14785542719890196
</pre>
</div>
</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>Such oracle estimates are not available for real-world datasets, as mentioned.  For real-world scenarios, we can, at least, evaluate the robustness of the ATE estimate to various data manipulations (i.e., sensitivity analysis or refutation).</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">evaluate_robustness</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_html rendered_html output_subarea output_execute_result"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Method</th>
      <th>ATE</th>
      <th>New ATE</th>
      <th>New ATE LB</th>
      <th>New ATE UB</th>
      <th>Distance from Desired (should be near 0)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Placebo Treatment</td>
      <td>0.130931</td>
      <td>0.00477642</td>
      <td>-0.00452705</td>
      <td>0.0140799</td>
      <td>0.00477642</td>
    </tr>
    <tr>
      <th>0</th>
      <td>Random Cause</td>
      <td>0.130931</td>
      <td>0.131122</td>
      <td>0.122196</td>
      <td>0.140049</td>
      <td>0.000191267</td>
    </tr>
    <tr>
      <th>0</th>
      <td>Subset Data(sample size @0.8)</td>
      <td>0.130931</td>
      <td>0.129383</td>
      <td>0.117239</td>
      <td>0.141528</td>
      <td>-0.0015477</td>
    </tr>
    <tr>
      <th>0</th>
      <td>Random Replace</td>
      <td>0.130931</td>
      <td>0.130196</td>
      <td>0.121209</td>
      <td>0.139184</td>
      <td>-0.000734766</td>
    </tr>
  </tbody>
</table>
</div></div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>Here, we see the distance from the desired value is near zero for each sensitivity analysis method, which is good.</p>

</div>
</div>
</div>
<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h3 id="Conditional-Average-Treatment-Effect-(CATE)">Conditional Average Treatment Effect (CATE)<a class="anchor-link" href="#Conditional-Average-Treatment-Effect-(CATE)"> </a></h3><p>We can also calculate the conditional average treatment effects (CATE). For instance, here is the treatment effect for those reviews that mention the word "toddler."</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">series</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;text&#39;</span><span class="p">]</span>
<span class="n">cm</span><span class="o">.</span><span class="n">estimate_ate</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;text&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="s1">&#39;toddler&#39;</span><span class="p">))</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>{&#39;ate&#39;: 0.15559234254638685}</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h3 id="Individualized-Treatment-Effect-(ITE)">Individualized Treatment Effect (ITE)<a class="anchor-link" href="#Individualized-Treatment-Effect-(ITE)"> </a></h3><p>We can easily predict the treatment effect for new or existing observations on a per-unit basis.  We just need to make sure the DataFrame supplied as input to <a href="/causalnlp/core.causalinference.html#CausalInferenceModel.predict"><code>CausalInferenceModel.predict</code></a> contains the right columns. This can easily be checked with <a href="/causalnlp/core.causalinference.html#CausalInferenceModel.get_required_columns"><code>CausalInferenceModel.get_required_columns</code></a>:</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">get_required_columns</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>[&#39;T_ac&#39;, &#39;C_true&#39;, &#39;text&#39;]</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">test_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
    <span class="s1">&#39;T_ac&#39;</span> <span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span>
    <span class="s1">&#39;C_true&#39;</span> <span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span>
    <span class="s1">&#39;text&#39;</span> <span class="p">:</span> <span class="p">[</span><span class="s1">&#39;I love the music of Zamfir and his pan flute.&#39;</span><span class="p">]</span>
      <span class="p">})</span>
</pre></div>

    </div>
</div>
</div>

</div>
    {% endraw %}

    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">test_df</span><span class="p">)</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>array([[0.40062776]])</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h3 id="Model-Interpetability">Model Interpretability<a class="anchor-link" href="#Model-Interpetability"> </a></h3><p>We can use the <code>interpret</code> method to identify the attributes most predictive of individualized treatment effects across observations.  Features beginning with <code>v_</code> are word (or vocabulary) features.  We see that words like "music", "cd", and "love" in addition to the categorical attribute <code>C_true</code> (the known confounder which is 1 for audio CDs) are most predictive of <em>individualized</em> causal effects.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">interpret</span><span class="p">(</span><span class="n">plot</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;feature_importance&#39;</span><span class="p">)[</span><span class="mi">1</span><span class="p">][:</span><span class="mi">10</span><span class="p">]</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>v_music    0.079042
v_cd       0.066838
v_album    0.055168
v_like     0.040784
v_love     0.040635
C_true     0.039949
v_just     0.035671
v_song     0.035362
v_great    0.029918
v_heard    0.028373
dtype: float64</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">explain</span><span class="p">(</span><span class="n">test_df</span><span class="p">,</span> <span class="n">row_num</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_png output_subarea ">
<img src="
"
alt="Plot produced by cm.explain(test_df, row_num=0)">
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="What-is-the-causal-impact-of-having-a-PhD-on-making-over-$50K?">What is the causal impact of having a PhD on making over $50K?<a class="anchor-link" href="#What-is-the-causal-impact-of-having-a-PhD-on-making-over-$50K?"> </a></h2><blockquote><p>Text is Optional in CausalNLP</p>
</blockquote>
<p>Despite the "NLP" in the name, <strong>CausalNLP</strong> can be used for causal analyses on traditional tabular datasets with no text fields.</p>
<p>Note: This dataset is from the early to mid 1990s, and we are using it as a toy dataset for demonstration purposes only.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;sample_data/adult-census.csv&#39;</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">strip</span><span class="p">())</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">applymap</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">x</span><span class="p">)</span> 
<span class="n">filter_set</span> <span class="o">=</span> <span class="s1">&#39;Doctorate&#39;</span>
<span class="n">df</span><span class="p">[</span><span class="s1">&#39;treatment&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;education&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">filter_set</span> <span class="k">else</span> <span class="mi">0</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_html rendered_html output_subarea output_execute_result"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>age</th>
      <th>workclass</th>
      <th>fnlwgt</th>
      <th>education</th>
      <th>education-num</th>
      <th>marital-status</th>
      <th>occupation</th>
      <th>relationship</th>
      <th>race</th>
      <th>sex</th>
      <th>capital-gain</th>
      <th>capital-loss</th>
      <th>hours-per-week</th>
      <th>native-country</th>
      <th>class</th>
      <th>treatment</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>25</td>
      <td>Private</td>
      <td>178478</td>
      <td>Bachelors</td>
      <td>13</td>
      <td>Never-married</td>
      <td>Tech-support</td>
      <td>Own-child</td>
      <td>White</td>
      <td>Female</td>
      <td>0</td>
      <td>0</td>
      <td>40</td>
      <td>United-States</td>
      <td>&lt;=50K</td>
      <td>0</td>
    </tr>
    <tr>
      <th>1</th>
      <td>23</td>
      <td>State-gov</td>
      <td>61743</td>
      <td>5th-6th</td>
      <td>3</td>
      <td>Never-married</td>
      <td>Transport-moving</td>
      <td>Not-in-family</td>
      <td>White</td>
      <td>Male</td>
      <td>0</td>
      <td>0</td>
      <td>35</td>
      <td>United-States</td>
      <td>&lt;=50K</td>
      <td>0</td>
    </tr>
    <tr>
      <th>2</th>
      <td>46</td>
      <td>Private</td>
      <td>376789</td>
      <td>HS-grad</td>
      <td>9</td>
      <td>Never-married</td>
      <td>Other-service</td>
      <td>Not-in-family</td>
      <td>White</td>
      <td>Male</td>
      <td>0</td>
      <td>0</td>
      <td>15</td>
      <td>United-States</td>
      <td>&lt;=50K</td>
      <td>0</td>
    </tr>
    <tr>
      <th>3</th>
      <td>55</td>
      <td>?</td>
      <td>200235</td>
      <td>HS-grad</td>
      <td>9</td>
      <td>Married-civ-spouse</td>
      <td>?</td>
      <td>Husband</td>
      <td>White</td>
      <td>Male</td>
      <td>0</td>
      <td>0</td>
      <td>50</td>
      <td>United-States</td>
      <td>&gt;50K</td>
      <td>0</td>
    </tr>
    <tr>
      <th>4</th>
      <td>36</td>
      <td>Private</td>
      <td>224541</td>
      <td>7th-8th</td>
      <td>4</td>
      <td>Married-civ-spouse</td>
      <td>Handlers-cleaners</td>
      <td>Husband</td>
      <td>White</td>
      <td>Male</td>
      <td>0</td>
      <td>0</td>
      <td>40</td>
      <td>El-Salvador</td>
      <td>&lt;=50K</td>
      <td>0</td>
    </tr>
  </tbody>
</table>
</div></div>

</div>

</div>
</div>

</div>
    {% endraw %}

    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">causalnlp</span> <span class="kn">import</span> <span class="n">CausalInferenceModel</span>
<span class="n">cm</span> <span class="o">=</span> <span class="n">CausalInferenceModel</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;t-learner&#39;</span><span class="p">,</span>
                   <span class="n">treatment_col</span><span class="o">=</span><span class="s1">&#39;treatment&#39;</span><span class="p">,</span> 
                   <span class="n">outcome_col</span><span class="o">=</span><span class="s1">&#39;class&#39;</span><span class="p">,</span>
                   <span class="n">ignore_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;fnlwgt&#39;</span><span class="p">,</span> <span class="s1">&#39;education&#39;</span><span class="p">,</span><span class="s1">&#39;education-num&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">fit</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">

<div class="output_subarea output_stream output_stdout output_text">
<pre>replaced [&#39;&lt;=50K&#39;, &#39;&gt;50K&#39;] in column &#34;class&#34; with [0, 1]
outcome column (categorical): class
treatment column: treatment
numerical/categorical covariates: [&#39;age&#39;, &#39;workclass&#39;, &#39;marital-status&#39;, &#39;occupation&#39;, &#39;relationship&#39;, &#39;race&#39;, &#39;sex&#39;, &#39;capital-gain&#39;, &#39;capital-loss&#39;, &#39;hours-per-week&#39;, &#39;native-country&#39;]
preprocess time:  0.4857158660888672  sec
start fitting causal inference model
time to fit causal inference model:  5.035430908203125  sec
</pre>
</div>
</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>Overall, the average treatment effect of having a PhD is an increase of 20 percentage points in the probability of making over $50K (with respect to this model and dataset):</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">estimate_ate</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>{&#39;ate&#39;: 0.20340645077516034}</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>For those who have a Master's degree:</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">estimate_ate</span><span class="p">(</span><span class="n">cm</span><span class="o">.</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;education&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="s1">&#39;Masters&#39;</span><span class="p">)</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>{&#39;ate&#39;: 0.17672418257642838}</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>For those who are high school dropouts:</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cm</span><span class="o">.</span><span class="n">estimate_ate</span><span class="p">(</span><span class="n">cm</span><span class="o">.</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;education&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="s1">&#39;Preschool&#39;</span><span class="p">,</span> <span class="s1">&#39;1st-4th&#39;</span><span class="p">,</span> <span class="s1">&#39;5th-6th&#39;</span><span class="p">,</span> <span class="s1">&#39;7th-8th&#39;</span><span class="p">,</span> <span class="s1">&#39;9th&#39;</span><span class="p">,</span> <span class="s1">&#39;10th&#39;</span><span class="p">,</span> <span class="s1">&#39;12th&#39;</span><span class="p">]))</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">



<div class="output_text output_subarea output_execute_result">
<pre>{&#39;ate&#39;: 0.2586697863578173}</pre>
</div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h2 id="What-is-the-causal-impact-of-a-job-training-program-on-earnings?">What is the causal impact of a job training program on earnings?<a class="anchor-link" href="#What-is-the-causal-impact-of-a-job-training-program-on-earnings?"> </a></h2><p>This is another example of causal inference on purely tabular data (no text).  Here, we will use the famous <a href="https://rdrr.io/cran/sbw/man/lalonde.html">LaLonde dataset</a> from a job training study.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;sample_data/lalonde.csv&#39;</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">


<div class="output_html rendered_html output_subarea output_execute_result"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>ID</th>
      <th>treat</th>
      <th>age</th>
      <th>educ</th>
      <th>black</th>
      <th>hispan</th>
      <th>married</th>
      <th>nodegree</th>
      <th>re74</th>
      <th>re75</th>
      <th>re78</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>NSW1</td>
      <td>1</td>
      <td>37</td>
      <td>11</td>
      <td>1</td>
      <td>0</td>
      <td>1</td>
      <td>1</td>
      <td>0.0</td>
      <td>0.0</td>
      <td>9930.0460</td>
    </tr>
    <tr>
      <th>1</th>
      <td>NSW2</td>
      <td>1</td>
      <td>22</td>
      <td>9</td>
      <td>0</td>
      <td>1</td>
      <td>0</td>
      <td>1</td>
      <td>0.0</td>
      <td>0.0</td>
      <td>3595.8940</td>
    </tr>
    <tr>
      <th>2</th>
      <td>NSW3</td>
      <td>1</td>
      <td>30</td>
      <td>12</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0.0</td>
      <td>0.0</td>
      <td>24909.4500</td>
    </tr>
    <tr>
      <th>3</th>
      <td>NSW4</td>
      <td>1</td>
      <td>27</td>
      <td>11</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>1</td>
      <td>0.0</td>
      <td>0.0</td>
      <td>7506.1460</td>
    </tr>
    <tr>
      <th>4</th>
      <td>NSW5</td>
      <td>1</td>
      <td>33</td>
      <td>8</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>1</td>
      <td>0.0</td>
      <td>0.0</td>
      <td>289.7899</td>
    </tr>
  </tbody>
</table>
</div></div>

</div>

</div>
</div>

</div>
    {% endraw %}

<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<p>Unlike other meta-learners that use LightGBM as a default, the S-Learner uses Linear Regression as the default base learner for regression problems, a model that is often used for this dataset. The ATE estimate is $1548, which indicates that the job training program had an overall positive effect.</p>

</div>
</div>
</div>
    {% raw %}
    
<div class="cell border-box-sizing code_cell rendered">
<div class="input">

<div class="inner_cell">
    <div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">causalnlp</span> <span class="kn">import</span> <span class="n">CausalInferenceModel</span>
<span class="n">cm</span> <span class="o">=</span> <span class="n">CausalInferenceModel</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;s-learner&#39;</span><span class="p">,</span>
                   <span class="n">treatment_col</span><span class="o">=</span><span class="s1">&#39;treat&#39;</span><span class="p">,</span> 
                   <span class="n">outcome_col</span><span class="o">=</span><span class="s1">&#39;re78&#39;</span><span class="p">,</span>
                   <span class="n">include_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;age&#39;</span><span class="p">,</span> <span class="s1">&#39;educ&#39;</span><span class="p">,</span> <span class="s1">&#39;black&#39;</span><span class="p">,</span> <span class="s1">&#39;hispan&#39;</span><span class="p">,</span> <span class="s1">&#39;married&#39;</span><span class="p">,</span> <span class="s1">&#39;nodegree&#39;</span><span class="p">,</span> <span class="s1">&#39;re74&#39;</span><span class="p">,</span> <span class="s1">&#39;re75&#39;</span><span class="p">])</span>
<span class="n">cm</span><span class="o">.</span><span class="n">fit</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="n">cm</span><span class="o">.</span><span class="n">estimate_ate</span><span class="p">())</span> <span class="c1"># ATE estimate = $1548</span>
</pre></div>

    </div>
</div>
</div>

<div class="output_wrapper">
<div class="output">

<div class="output_area">

<div class="output_subarea output_stream output_stdout output_text">
<pre>outcome column (numerical): re78
treatment column: treat
numerical/categorical covariates: [&#39;age&#39;, &#39;educ&#39;, &#39;black&#39;, &#39;hispan&#39;, &#39;married&#39;, &#39;nodegree&#39;, &#39;re74&#39;, &#39;re75&#39;]
preprocess time:  0.017691612243652344  sec
start fitting causal inference model
time to fit causal inference model:  0.0024728775024414062  sec
{&#39;ate&#39;: 1548.2438019996084}
</pre>
</div>
</div>

</div>
</div>

</div>
    {% endraw %}

</div>


back to top