New earnings release post

This commit is contained in:
Bradlee Speice
2016-06-09 19:27:36 -04:00
parent 0ba3723cd2
commit f9e8c08491
26 changed files with 17371 additions and 1256 deletions

View File

@ -103,18 +103,18 @@
<div class="prompt input_prompt">In&nbsp;[1]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<div class=" highlight hl-ipython3"><pre><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>
<span class="o">%</span><span class="k">matplotlib</span> inline
<span class="c1"># Record how long it takes to run the notebook - I&#39;m curious.</span>
<span class="c"># Record how long it takes to run the notebook - I&#39;m curious.</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="k">import</span> <span class="n">datetime</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;split_train.csv&#39;</span><span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">&#39;split_train.csv&#39;</span><span class="p">)</span>
<span class="n">dataset</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">ID</span>
<span class="n">X</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">drop</span><span class="p">([</span><span class="s1">&#39;TARGET&#39;</span><span class="p">,</span> <span class="s1">&#39;ID&#39;</span><span class="p">,</span> <span class="s1">&#39;ID.1&#39;</span><span class="p">],</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">X</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">drop</span><span class="p">([</span><span class="s">&#39;TARGET&#39;</span><span class="p">,</span> <span class="s">&#39;ID&#39;</span><span class="p">,</span> <span class="s">&#39;ID.1&#39;</span><span class="p">],</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">y</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">TARGET</span>
</pre></div>
@ -128,7 +128,7 @@
<div class="prompt input_prompt">In&nbsp;[2]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">y</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
<div class=" highlight hl-ipython3"><pre><span class="n">y</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</pre></div>
</div>
@ -157,7 +157,7 @@
<div class="prompt input_prompt">In&nbsp;[3]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="nb">len</span><span class="p">(</span><span class="n">X</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<div class=" highlight hl-ipython3"><pre><span class="nb">len</span><span class="p">(</span><span class="n">X</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
</pre></div>
</div>
@ -199,7 +199,7 @@
<div class="prompt input_prompt">In&nbsp;[4]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">cols</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">columns</span>
<div class=" highlight hl-ipython3"><pre><span class="n">cols</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">columns</span>
<span class="n">b_class</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">X</span><span class="p">[</span><span class="n">c</span><span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">())</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span>
@ -244,13 +244,13 @@
<div class="prompt input_prompt">In&nbsp;[5]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># First we need to `binarize` the data to 0-1; some of the labels are {0, 1},</span>
<span class="c1"># some are {0, 3}, etc.</span>
<div class=" highlight hl-ipython3"><pre><span class="c"># First we need to `binarize` the data to 0-1; some of the labels are {0, 1},</span>
<span class="c"># some are {0, 3}, etc.</span>
<span class="kn">from</span> <span class="nn">sklearn.preprocessing</span> <span class="k">import</span> <span class="n">binarize</span>
<span class="n">X_bin</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">X</span><span class="p">[</span><span class="n">b_class</span><span class="p">])</span>
<span class="n">accuracy</span> <span class="o">=</span> <span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">X_bin</span><span class="p">[:,</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">y</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">b_class</span><span class="p">))]</span>
<span class="n">acc_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;Accuracy&quot;</span><span class="p">:</span> <span class="n">accuracy</span><span class="p">},</span> <span class="n">index</span><span class="o">=</span><span class="n">b_class</span><span class="p">)</span>
<span class="n">acc_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s">&quot;Accuracy&quot;</span><span class="p">:</span> <span class="n">accuracy</span><span class="p">},</span> <span class="n">index</span><span class="o">=</span><span class="n">b_class</span><span class="p">)</span>
<span class="n">acc_df</span><span class="o">.</span><span class="n">describe</span><span class="p">()</span>
</pre></div>
@ -333,10 +333,10 @@
<div class="prompt input_prompt">In&nbsp;[6]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">unsat</span> <span class="o">=</span> <span class="n">y</span><span class="p">[</span><span class="n">y</span> <span class="o">==</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Satisfied customers: </span><span class="si">{}</span><span class="s2">; Unsatisfied customers: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">)</span> <span class="o">-</span> <span class="n">unsat</span><span class="p">,</span> <span class="n">unsat</span><span class="p">))</span>
<div class=" highlight hl-ipython3"><pre><span class="n">unsat</span> <span class="o">=</span> <span class="n">y</span><span class="p">[</span><span class="n">y</span> <span class="o">==</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="s">&quot;Satisfied customers: {}; Unsatisfied customers: {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">)</span> <span class="o">-</span> <span class="n">unsat</span><span class="p">,</span> <span class="n">unsat</span><span class="p">))</span>
<span class="n">naive_guess</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">y</span> <span class="o">==</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">)))</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Naive guess accuracy: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">naive_guess</span><span class="p">))</span>
<span class="nb">print</span><span class="p">(</span><span class="s">&quot;Naive guess accuracy: {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">naive_guess</span><span class="p">))</span>
</pre></div>
</div>
@ -383,7 +383,7 @@ Naive guess accuracy: 0.9608561656706882
<div class="prompt input_prompt">In&nbsp;[7]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="kn">from</span> <span class="nn">sklearn.naive_bayes</span> <span class="k">import</span> <span class="n">GaussianNB</span>
<div class=" highlight hl-ipython3"><pre><span class="kn">from</span> <span class="nn">sklearn.naive_bayes</span> <span class="k">import</span> <span class="n">GaussianNB</span>
<span class="kn">from</span> <span class="nn">sklearn.decomposition</span> <span class="k">import</span> <span class="n">PCA</span>
<span class="n">X_no_bin</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">b_class</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
@ -397,9 +397,9 @@ Naive guess accuracy: 0.9608561656706882
<span class="k">return</span> <span class="n">gnb</span><span class="o">.</span><span class="n">score</span><span class="p">(</span><span class="n">X_xform</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span>
<span class="n">dim_range</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">21</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">dim_range</span><span class="p">,</span> <span class="p">[</span><span class="n">evaluate_gnb</span><span class="p">(</span><span class="n">dim</span><span class="p">)</span> <span class="k">for</span> <span class="n">dim</span> <span class="ow">in</span> <span class="n">dim_range</span><span class="p">],</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Gaussian NB Accuracy&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Inverse Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">dim_range</span><span class="p">,</span> <span class="p">[</span><span class="n">evaluate_gnb</span><span class="p">(</span><span class="n">dim</span><span class="p">)</span> <span class="k">for</span> <span class="n">dim</span> <span class="ow">in</span> <span class="n">dim_range</span><span class="p">],</span> <span class="n">label</span><span class="o">=</span><span class="s">&quot;Gaussian NB Accuracy&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">&quot;Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">&quot;Inverse Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">gcf</span><span class="p">()</span><span class="o">.</span><span class="n">set_size_inches</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
</pre></div>
@ -674,7 +674,7 @@ rkJggg==
<div class="prompt input_prompt">In&nbsp;[8]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="k">def</span> <span class="nf">evaluate_gnb_full</span><span class="p">(</span><span class="n">dims</span><span class="p">):</span>
<div class=" highlight hl-ipython3"><pre><span class="k">def</span> <span class="nf">evaluate_gnb_full</span><span class="p">(</span><span class="n">dims</span><span class="p">):</span>
<span class="n">pca</span> <span class="o">=</span> <span class="n">PCA</span><span class="p">(</span><span class="n">n_components</span><span class="o">=</span><span class="n">dims</span><span class="p">)</span>
<span class="n">X_xform</span> <span class="o">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">fit_transform</span><span class="p">(</span><span class="n">X</span><span class="p">)</span>
@ -683,9 +683,9 @@ rkJggg==
<span class="k">return</span> <span class="n">gnb</span><span class="o">.</span><span class="n">score</span><span class="p">(</span><span class="n">X_xform</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span>
<span class="n">dim_range</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">21</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">dim_range</span><span class="p">,</span> <span class="p">[</span><span class="n">evaluate_gnb</span><span class="p">(</span><span class="n">dim</span><span class="p">)</span> <span class="k">for</span> <span class="n">dim</span> <span class="ow">in</span> <span class="n">dim_range</span><span class="p">],</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Gaussian NB Accuracy&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s2">&quot;Inverse Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s1">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">dim_range</span><span class="p">,</span> <span class="p">[</span><span class="n">evaluate_gnb</span><span class="p">(</span><span class="n">dim</span><span class="p">)</span> <span class="k">for</span> <span class="n">dim</span> <span class="ow">in</span> <span class="n">dim_range</span><span class="p">],</span> <span class="n">label</span><span class="o">=</span><span class="s">&quot;Gaussian NB Accuracy&quot;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">&quot;Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">naive_guess</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">&quot;Inverse Naive Guess&quot;</span><span class="p">,</span> <span class="n">c</span><span class="o">=</span><span class="s">&#39;k&#39;</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">gcf</span><span class="p">()</span><span class="o">.</span><span class="n">set_size_inches</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">();</span>
</pre></div>
@ -960,8 +960,8 @@ rkJggg==
<div class="prompt input_prompt">In&nbsp;[9]:</div>
<div class="inner_cell">
<div class="input_area">
<div class=" highlight hl-ipython3"><pre><span></span><span class="n">end</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Running time: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">end</span> <span class="o">-</span> <span class="n">start</span><span class="p">))</span>
<div class=" highlight hl-ipython3"><pre><span class="n">end</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="s">&quot;Running time: {}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">end</span> <span class="o">-</span> <span class="n">start</span><span class="p">))</span>
</pre></div>
</div>
@ -989,15 +989,15 @@ rkJggg==
<div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
<h1 id="Appendix">Appendix<a class="anchor-link" href="#Appendix">&#182;</a></h1><p>Code used to split the initial training data:</p>
<div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">sklearn.cross_validation</span> <span class="kn">import</span> <span class="n">train_test_split</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;train.csv&#39;</span><span class="p">)</span>
<div class="highlight"><pre><span class="kn">from</span> <span class="nn">sklearn.cross_validation</span> <span class="kn">import</span> <span class="n">train_test_split</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">&#39;train.csv&#39;</span><span class="p">)</span>
<span class="n">data</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">ID</span>
<span class="n">data_train</span><span class="p">,</span> <span class="n">data_validate</span> <span class="o">=</span> <span class="n">train_test_split</span><span class="p">(</span>
<span class="n">data</span><span class="p">,</span> <span class="n">train_size</span><span class="o">=.</span><span class="mi">7</span><span class="p">)</span>
<span class="n">data_train</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">&#39;split_train.csv&#39;</span><span class="p">)</span>
<span class="n">data_validate</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">&#39;split_validate.csv&#39;</span><span class="p">)</span>
<span class="n">data_train</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s">&#39;split_train.csv&#39;</span><span class="p">)</span>
<span class="n">data_validate</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s">&#39;split_validate.csv&#39;</span><span class="p">)</span>
</pre></div>
</div>