speice.io/2016/03/predicting-santander-customer-happiness/index.html

48 lines
83 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html><html lang=en dir=ltr class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated=false><meta charset=UTF-8><meta name=generator content="Docusaurus v3.6.0"><title data-rh=true>Predicting Santander customer happiness | The Old Speice Guy</title><meta data-rh=true name=viewport content="width=device-width,initial-scale=1.0"><meta data-rh=true name=twitter:card content=summary_large_image><meta data-rh=true property=og:url content=https://speice.io/2016/03/predicting-santander-customer-happiness><meta data-rh=true property=og:locale content=en><meta data-rh=true name=docusaurus_locale content=en><meta data-rh=true name=docusaurus_tag content=default><meta data-rh=true name=docsearch:language content=en><meta data-rh=true name=docsearch:docusaurus_tag content=default><meta data-rh=true property=og:title content="Predicting Santander customer happiness | The Old Speice Guy"><meta data-rh=true name=description content="My first Kaggle competition."><meta data-rh=true property=og:description content="My first Kaggle competition."><meta data-rh=true property=og:type content=article><meta data-rh=true property=article:published_time content=2016-03-05T12:00:00.000Z><link data-rh=true rel=icon href=/img/favicon.ico><link data-rh=true rel=canonical href=https://speice.io/2016/03/predicting-santander-customer-happiness><link data-rh=true rel=alternate href=https://speice.io/2016/03/predicting-santander-customer-happiness hreflang=en><link data-rh=true rel=alternate href=https://speice.io/2016/03/predicting-santander-customer-happiness hreflang=x-default><script data-rh=true type=application/ld+json>{"@context":"https://schema.org","@id":"https://speice.io/2016/03/predicting-santander-customer-happiness","@type":"BlogPosting","author":{"@type":"Person","name":"Bradlee Speice"},"dateModified":"2024-11-06T03:32:56.000Z","datePublished":"2016-03-05T12:00:00.000Z","description":"My first Kaggle competition.","headline":"Predicting Santander 
customer happiness","isPartOf":{"@id":"https://speice.io/","@type":"Blog","name":"Blog"},"keywords":[],"mainEntityOfPage":"https://speice.io/2016/03/predicting-santander-customer-happiness","name":"Predicting Santander customer happiness","url":"https://speice.io/2016/03/predicting-santander-customer-happiness"}</script><link rel=alternate type=application/rss+xml href=/rss.xml title="The Old Speice Guy RSS Feed"><link rel=alternate type=application/atom+xml href=/atom.xml title="The Old Speice Guy Atom Feed"><link rel=stylesheet href=https://cdn.jsdelivr.net/npm/katex@0.13.24/dist/katex.min.css integrity=sha384-odtC+0UGzzFL/6PNoE8rX/SPcQDXBJ+uRepguP4QkPCm2LBxH3FA3y+fKSiJ+AmM crossorigin><link rel=stylesheet href=/assets/css/styles.ae6ff4a3.css><script src=/assets/js/runtime~main.751b419d.js defer></script><script src=/assets/js/main.62ce6156.js defer></script><body class=navigation-with-keyboard><script>!function(){var t,e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme")}catch(t){}}();t=null!==e?e:"light",document.documentElement.setAttribute("data-theme",t)}(),function(){try{for(var[t,e]of new URLSearchParams(window.location.search).entries())if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id=__docusaurus><div role=region aria-label="Skip to main content"><a class=skipToContent_fXgn href=#__docusaurus_skipToContent_fallback>Skip to main content</a></div><nav aria-label=Main class="navbar navbar--fixed-top"><div class=navbar__inner><div class=navbar__items><button aria-label="Toggle navigation bar" aria-expanded=false class="navbar__toggle clean-btn" type=button><svg width=30 height=30 viewBox="0 0 30 30" aria-hidden=true><path stroke=currentColor stroke-linecap=round stroke-miterlimit=10 stroke-width=2 d="M4 7h22M4 15h22M4 23h22"/></svg></button><a 
class=navbar__brand href=/><div class=navbar__logo><img src=/img/logo.svg alt="Sierpinski Gasket" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src=/img/logo-dark.svg alt="Sierpinski Gasket" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div><b class="navbar__title text--truncate">The Old Speice Guy</b></a></div><div class="navbar__items navbar__items--right"><a href=https://github.com/bspeice target=_blank rel="noopener noreferrer" class="navbar__item navbar__link header-github-link"></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type=button disabled title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live=polite aria-pressed=false><svg viewBox="0 0 24 24" width=24 height=24 class=lightToggleIcon_pyhR><path fill=currentColor d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"/></svg><svg viewBox="0 0 24 24" width=24 height=24 
class=darkToggleIcon_wfgR><path fill=currentColor d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"/></svg></button></div><div class=navbarSearchContainer_Bca1><div class=navbar__search><span aria-label="expand searchbar" role=button class=search-icon tabindex=0></span><input id=search_input_react type=search placeholder=Loading... aria-label=Search class="navbar__search-input search-bar" disabled></div></div></div></div><div role=presentation class=navbar-sidebar__backdrop></div></nav><div id=__docusaurus_skipToContent_fallback class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class=row><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">All posts</div><div role=group><h3>2022</h3><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2011/11/webpack-industrial-complex>The webpack industrial complex</a></ul></div><div role=group><h3>2019</h3><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/12/release-the-gil>Release the GIL</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/09/binary-format-shootout>Binary format shootout</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/06/high-performance-systems>On building high performance systems</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/05/making-bread>Making 
bread</a></ul><div role=group><h4>Allocations in Rust</h4><ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/02/understanding-allocations-in-rust>Foreword</a><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/02/the-whole-world>Global memory</a><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/02/stacking-up>Fixed memory</a><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/02/a-heaping-helping>Dynamic memory</a><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/02/08/compiler-optimizations>Compiler optimizations</a><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2019/02/summary>Summary</a></ul></ul></div></div><div role=group><h3>2018</h3><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/12/allocation-safety>QADAPT - debug_assert! for allocations</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/12/what-small-business-really-means>More "what companies really mean"</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/10/case-study-optimization>A case study in heaptrack</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/09/isomorphic-apps>Isomorphic desktop apps with Rust</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/09/primitives-in-rust-are-weird>Primitives in Rust are weird (and cool)</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/06/dateutil-parser-to-rust>What I learned porting dateutil to Rust</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H 
href=/2018/05/hello>Hello!</a></ul><div role=group><h4>Captain's Cookbook</h4><ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/01/captains-cookbook-part-1>Project setup</a><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2018/01/captains-cookbook-part-2>Practical usage</a></ul></ul></div></div><div role=group><h3>2016</h3><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/11/pca-audio-compression>PCA audio compression</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/10/rustic-repodcasting>A Rustic re-podcasting server</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/06/event-studies-and-earnings-releases>Event studies and earnings releases</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/05/the-unfair-casino>The unfair casino</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/04/tick-tock>Tick tock...</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/03/tweet-like-me>Tweet like me</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a aria-current=page class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href=/2016/03/predicting-santander-customer-happiness>Predicting Santander customer happiness</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/02/profitability-using-the-investment-formula>Profitability using the investment formula</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H 
href=/2016/02/guaranteed-money-maker>Guaranteed money maker</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/01/cloudy-in-seattle>Cloudy in Seattle</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2016/01/complaining-about-the-weather>Complaining about the weather</a></ul></div><div role=group><h3>2015</h3><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2015/12/testing-cramer>Testing Cramer</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2015/11/autocallable>Autocallable Bonds</a></ul><ul class="sidebarItemList_Yudw clean-list"><li class=sidebarItem__DBe><a class=sidebarItemLink_mo7H href=/2015/11/welcome>Welcome, and an algorithm</a></ul></div></nav></aside><main class="col col--7"><article><header><h1 class=title_f1Hy>Predicting Santander customer happiness</h1><div class="container_mt6G margin-vert--md"><time datetime=2016-03-05T12:00:00.000Z>March 5, 2016</time> · <!-- -->7 min read</div><div class="margin-top--md margin-bottom--sm row"><div class="col col--12 authorCol_Hf19"><div class="avatar margin-bottom--sm"><div class="avatar__intro authorDetails_lV9A"><div class=avatar__name><span class=authorName_yefp>Bradlee Speice</span></div><div class=authorSocials_rSDt><a href=https://github.com/bspeice target=_blank rel="noopener noreferrer" class=authorSocialLink_owbf title=GitHub><svg viewBox="0 0 256 250" width=1em height=1em class="authorSocialLink_owbf githubSvg_Uu4N" style=--dark:#000;--light:#fff preserveAspectRatio=xMidYMid><path d="M128.001 0C57.317 0 0 57.307 0 128.001c0 56.554 36.676 104.535 87.535 121.46 6.397 1.185 8.746-2.777 8.746-6.158 0-3.052-.12-13.135-.174-23.83-35.61 7.742-43.124-15.103-43.124-15.103-5.823-14.795-14.213-18.73-14.213-18.73-11.613-7.944.876-7.78.876-7.78 12.853.902 19.621 
13.19 19.621 13.19 11.417 19.568 29.945 13.911 37.249 10.64 1.149-8.272 4.466-13.92 8.127-17.116-28.431-3.236-58.318-14.212-58.318-63.258 0-13.975 5-25.394 13.188-34.358-1.329-3.224-5.71-16.242 1.24-33.874 0 0 10.749-3.44 35.21 13.121 10.21-2.836 21.16-4.258 32.038-4.307 10.878.049 21.837 1.47 32.066 4.307 24.431-16.56 35.165-13.12 35.165-13.12 6.967 17.63 2.584 30.65 1.255 33.873 8.207 8.964 13.173 20.383 13.173 34.358 0 49.163-29.944 59.988-58.447 63.157 4.591 3.972 8.682 11.762 8.682 23.704 0 17.126-.148 30.91-.148 35.126 0 3.407 2.304 7.398 8.792 6.14C219.37 232.5 256 184.537 256 128.002 256 57.307 198.691 0 128.001 0Zm-80.06 182.34c-.282.636-1.283.827-2.194.39-.929-.417-1.45-1.284-1.15-1.922.276-.655 1.279-.838 2.205-.399.93.418 1.46 1.293 1.139 1.931Zm6.296 5.618c-.61.566-1.804.303-2.614-.591-.837-.892-.994-2.086-.375-2.66.63-.566 1.787-.301 2.626.591.838.903 1 2.088.363 2.66Zm4.32 7.188c-.785.545-2.067.034-2.86-1.104-.784-1.138-.784-2.503.017-3.05.795-.547 2.058-.055 2.861 1.075.782 1.157.782 2.522-.019 3.08Zm7.304 8.325c-.701.774-2.196.566-3.29-.49-1.119-1.032-1.43-2.496-.726-3.27.71-.776 2.213-.558 3.315.49 1.11 1.03 1.45 2.505.701 3.27Zm9.442 2.81c-.31 1.003-1.75 1.459-3.199 1.033-1.448-.439-2.395-1.613-2.103-2.626.301-1.01 1.747-1.484 3.207-1.028 1.446.436 2.396 1.602 2.095 2.622Zm10.744 1.193c.036 1.055-1.193 1.93-2.715 1.95-1.53.034-2.769-.82-2.786-1.86 0-1.065 1.202-1.932 2.733-1.958 1.522-.03 2.768.818 2.768 1.868Zm10.555-.405c.182 1.03-.875 2.088-2.387 2.37-1.485.271-2.861-.365-3.05-1.386-.184-1.056.893-2.114 2.376-2.387 1.514-.263 2.868.356 3.061 1.403Z"/></svg></a></div></div></div></div></div></header><div id=__blog-post-container class=markdown><p>My first Kaggle competition.</p>
<p>It's time! After embarking on a Machine Learning class this semester, and with a Saturday in which I don't have much planned, I wanted to put this class and training to work. It's my first competition submission. I want to walk you guys through how I'm approaching this problem, because I thought it would be really neat. The competition is Banco Santander's <a href=https://www.kaggle.com/c/santander-customer-satisfaction target=_blank rel="noopener noreferrer">Santander Customer Satisfaction</a> competition. It seemed like an easy enough problem that I could actually make decent progress on it.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id=data-exploration>Data Exploration<a href=#data-exploration class=hash-link aria-label="Direct link to Data Exploration" title="Direct link to Data Exploration"></a></h2>
<p>First up: we need to load our data and do some exploratory work. Because we're going to be using this data for model selection prior to testing, we need to make a further split. I've already gone ahead and done this work; please see the code in the <a href=#appendix>appendix below</a>.</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> pandas </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">as</span><span class="token plain"> pd</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> numpy </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">as</span><span class="token plain"> np</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> matplotlib</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">pyplot </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">as</span><span class="token plain"> plt</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token operator" style="color:hsl(221, 87%, 60%)">%</span><span class="token plain">matplotlib inline</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token comment" style="color:hsl(230, 4%, 64%)"># Record how long it takes to run the notebook - I'm curious.</span><span class="token 
plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">from</span><span class="token plain"> datetime </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> datetime</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">start </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> datetime</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">now</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">dataset </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> pd</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">read_csv</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">'split_train.csv'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">dataset</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">index </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> dataset</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token 
plain">ID</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">X </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> dataset</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">drop</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token string" style="color:hsl(119, 34%, 47%)">'TARGET'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token string" style="color:hsl(119, 34%, 47%)">'ID'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token string" style="color:hsl(119, 34%, 47%)">'ID.1'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">y </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> dataset</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">TARGET</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg 
viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">unique</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> array([0, 1], dtype=int64)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token builtin" style="color:hsl(119, 34%, 47%)">len</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">columns</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> 369</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<p>Okay, so there are only <a href=https://www.kaggle.com/c/santander-customer-satisfaction/data target=_blank rel="noopener noreferrer">two classes we're predicting</a>: 1 for unsatisfied customers, 0 for satisfied customers. I would have preferred this to be something more like a regression, or predicting multiple classes: maybe the customer isn't the most happy, but is nowhere near closing their accounts. For now though, that's just the data we're working with.</p>
<p>Now, I'd like to make a scatter matrix of everything going on. Unfortunately, as noted above, we have 369 different features. There's no way I can graphically make sense of that much data to start with.</p>
<p>We're also not told what the data actually represents: Are these survey results? Average time between contact with a customer care person? Frequency of contacting a customer care person? The idea is that I need to reduce the number of dimensions we're predicting across.</p>
<h3 class="anchor anchorWithStickyNavbar_LWe7" id=dimensionality-reduction-pt-1---binary-classifiers>Dimensionality Reduction pt. 1 - Binary Classifiers<a href=#dimensionality-reduction-pt-1---binary-classifiers class=hash-link aria-label="Direct link to Dimensionality Reduction pt. 1 - Binary Classifiers" title="Direct link to Dimensionality Reduction pt. 1 - Binary Classifiers"></a></h3>
<p>My first attempt to reduce the data dimensionality is to find all the binary classifiers in the dataset (i.e. features that take only two distinct values) and see if any of those are good (or anti-good) predictors of the target.</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">cols </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> X</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">columns</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">b_class </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">for</span><span class="token plain"> c </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">in</span><span class="token plain"> cols</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">:</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">if</span><span class="token plain"> </span><span class="token builtin" style="color:hsl(119, 34%, 47%)">len</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token plain">c</span><span class="token 
punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">unique</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"> </span><span class="token operator" style="color:hsl(221, 87%, 60%)">==</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">2</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">:</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> b_class</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">append</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">c</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> </span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token builtin" style="color:hsl(119, 34%, 47%)">len</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">b_class</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 
24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> 111</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<p>So there are 111 features in the dataset that are binary labels. Let's see if any of them are good at predicting the users' satisfaction!</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token comment" style="color:hsl(230, 4%, 64%)"># First we need to `binarize` the data to 0-1; some of the labels are {0, 1},</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token comment" style="color:hsl(230, 4%, 64%)"># some are {0, 3}, etc.</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">from</span><span class="token plain"> sklearn</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">preprocessing </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> binarize</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">X_bin </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> binarize</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token plain">b_class</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" 
style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">accuracy </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token plain">np</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">mean</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X_bin</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">:</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain">i</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token plain"> </span><span class="token operator" style="color:hsl(221, 87%, 60%)">==</span><span class="token plain"> y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"> </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">for</span><span class="token plain"> i </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">in</span><span class="token plain"> </span><span class="token builtin" style="color:hsl(119, 34%, 47%)">range</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">0</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token builtin" style="color:hsl(119, 34%, 47%)">len</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">b_class</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 
47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">acc_df </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> pd</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">DataFrame</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">{</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Accuracy"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">:</span><span class="token plain"> accuracy</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">}</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> index</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain">b_class</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">acc_df</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">describe</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" 
class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div><table><thead><tr><th><th>Accuracy<tbody><tr><th>count<td>111.000000<tr><th>mean<td>0.905159<tr><th>std<td>0.180602<tr><th>min<td>0.043598<tr><th>25%<td>0.937329<tr><th>50%<td>0.959372<tr><th>75%<td>0.960837<tr><th>max<td>0.960837</table></div>
<p>Wow! Looks like we've got some incredibly predictive features! So much so that we should be a bit concerned. My initial guess for what's happening is that we have a sparsity issue: so many of the values are 0, and these likely happen to line up with satisfied customers.</p>
<p>So the question we must now answer, which I likely should have asked long before now: What exactly is the distribution of un/satisfied customers?</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">unsat </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token plain">y </span><span class="token operator" style="color:hsl(221, 87%, 60%)">==</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">count</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">print</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Satisfied customers: {}; Unsatisfied customers: {}"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token builtin" style="color:hsl(119, 34%, 47%)">format</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token builtin" style="color:hsl(119, 34%, 47%)">len</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">y</span><span class="token 
punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"> </span><span class="token operator" style="color:hsl(221, 87%, 60%)">-</span><span class="token plain"> unsat</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> unsat</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">naive_guess </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> np</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">mean</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">y </span><span class="token operator" style="color:hsl(221, 87%, 60%)">==</span><span class="token plain"> np</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">zeros</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token builtin" style="color:hsl(119, 34%, 47%)">len</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">print</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Naive guess accuracy: {}"</span><span 
class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token builtin" style="color:hsl(119, 34%, 47%)">format</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">naive_guess</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> Satisfied customers: 51131; Unsatisfied customers: 2083</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> Naive guess accuracy: 0.9608561656706882</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<p>This is a bit discouraging. A naive guess of "always satisfied" performs as well as our best individual binary classifier. What this tells me, then, is that these data columns aren't incredibly helpful in prediction. I'd be interested in a polynomial expansion of this dataset, but for now, that's more computation than I want to take on.</p>
<h3 class="anchor anchorWithStickyNavbar_LWe7" id=dimensionality-reduction-pt-2---lda>Dimensionality Reduction pt. 2 - LDA<a href=#dimensionality-reduction-pt-2---lda class=hash-link aria-label="Direct link to Dimensionality Reduction pt. 2 - LDA" title="Direct link to Dimensionality Reduction pt. 2 - LDA"></a></h3>
<p>Knowing that our naive guess performs so well is a blessing and a curse:</p>
<ul>
<li>Curse: The threshold for performance is incredibly high: We can only "improve" over the naive guess by 4%</li>
<li>Blessing: All the binary classification features we just discovered are worthless on their own. We can throw them out and reduce the data dimensionality from 369 to 258 (the 369 original features minus the 111 binary ones).</li>
</ul>
<p>Now, in removing these features from the dataset, I'm not saying that there is no "information" contained within them. There might be. But the only way we'd know is through a polynomial expansion, and I'm not going to take that on within this post.</p>
<p>My initial thought for a "next guess" is to use the <a href=http://scikit-learn.org/stable/modules/lda_qda.html target=_blank rel="noopener noreferrer">LDA</a> model for dimensionality reduction. However, it can only reduce dimensions to at most <span class=katex><span class=katex-mathml><math><semantics><mrow><mi>p</mi><mo>−</mo><mn>1</mn></mrow><annotation encoding=application/x-tex>p - 1</annotation></semantics></math></span><span class=katex-html aria-hidden=true><span class=base><span class=strut style=height:0.7778em;vertical-align:-0.1944em></span><span class="mord mathnormal">p</span><span class=mspace style=margin-right:0.2222em></span><span class=mbin>−</span><span class=mspace style=margin-right:0.2222em></span></span><span class=base><span class=strut style=height:0.6444em></span><span class=mord>1</span></span></span></span>, with <span class=katex><span class=katex-mathml><math><semantics><mrow><mi>p</mi></mrow><annotation encoding=application/x-tex>p</annotation></semantics></math></span><span class=katex-html aria-hidden=true><span class=base><span class=strut style=height:0.625em;vertical-align:-0.1944em></span><span class="mord mathnormal">p</span></span></span></span> being the number of classes. Since this is a binary classification, every LDA model that I try will have dimensionality one; when I actually try this, the predictor ends up being slightly less accurate than the naive guess.</p>
<p>Instead, let's take a different approach to dimensionality reduction: <a href=http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html target=_blank rel="noopener noreferrer">principal component analysis</a>. This allows us to perform the dimensionality reduction without worrying about the number of classes. Then, we'll use a <a href=http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes target=_blank rel="noopener noreferrer">Gaussian Naive Bayes</a> model to actually do the prediction. This model is chosen simply because it doesn't take a long time to fit and compute; because PCA will take so long, I just want a prediction at the end of this. We can worry about using a more sophisticated LDA/QDA/SVM model later.</p>
<p>Now into the actual process: We're going to test out PCA dimensionality reduction from 1 to 20 dimensions, and then predict using a Gaussian Naive Bayes model. The 20 dimensions upper limit was selected because the accuracy never improves after you get beyond that (I found out by running it myself). Hopefully, we'll find that we can create a model better than the naive guess.</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token keyword" style="color:hsl(301, 63%, 40%)">from</span><span class="token plain"> sklearn</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">naive_bayes </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> GaussianNB</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">from</span><span class="token plain"> sklearn</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">decomposition </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> PCA</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">X_no_bin </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> X</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">drop</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">b_class</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token punctuation" 
style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">def</span><span class="token plain"> </span><span class="token function" style="color:hsl(221, 87%, 60%)">evaluate_gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">dims</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">:</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> pca </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> PCA</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">n_components</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain">dims</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> X_xform </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> pca</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">fit_transform</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X_no_bin</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> </span><br></span><span 
class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> gnb </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> GaussianNB</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">fit</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X_xform</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">return</span><span class="token plain"> gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">score</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X_xform</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">dim_range </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> np</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span 
class="token plain">arange</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">21</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">plot</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">dim_range</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token plain">evaluate_gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">dim</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"> </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">for</span><span class="token plain"> dim </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">in</span><span class="token plain"> dim_range</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> label</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Gaussian NB Accuracy"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token 
plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">axhline</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">naive_guess</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> label</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Naive Guess"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> c</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">'k'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">axhline</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token plain"> </span><span class="token operator" style="color:hsl(221, 87%, 60%)">-</span><span class="token plain"> naive_guess</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> label</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Inverse Naive Guess"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> c</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">'k'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token 
plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">gcf</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">set_size_inches</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">12</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">6</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">legend</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">;</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
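<p><img decoding=async loading=lazy alt="Line plot of Gaussian Naive Bayes accuracy for 1 to 20 PCA dimensions, with horizontal reference lines for the naive guess and the inverse naive guess" src=/assets/images/_notebook_11_0-2d0fe64b876b1c32a095f2d74b128f3c.png width=710 height=368 class=img_ev3q></p>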
<p><strong>sigh...</strong> After all the effort and computational power, we're still at square one: we have yet to beat out the naive guess threshold. With PCA in play we end up performing terribly, but not terribly enough that we can guess against ourselves.</p>
<p>Let's try one last-ditch attempt using the entire data set:</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token keyword" style="color:hsl(301, 63%, 40%)">def</span><span class="token plain"> </span><span class="token function" style="color:hsl(221, 87%, 60%)">evaluate_gnb_full</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">dims</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">:</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> pca </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> PCA</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">n_components</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain">dims</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> X_xform </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> pca</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">fit_transform</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token 
plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> </span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> gnb </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> GaussianNB</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">fit</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X_xform</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">return</span><span class="token plain"> gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">score</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">X_xform</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> y</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">dim_range </span><span class="token operator" style="color:hsl(221, 87%, 
60%)">=</span><span class="token plain"> np</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">arange</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">21</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">plot</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">dim_range</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">[</span><span class="token plain">evaluate_gnb</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">dim</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"> </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">for</span><span class="token plain"> dim </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">in</span><span class="token plain"> dim_range</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">]</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> label</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Gaussian NB Accuracy"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span 
class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">axhline</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">naive_guess</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> label</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Naive Guess"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> c</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">'k'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">axhline</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">1</span><span class="token plain"> </span><span class="token operator" style="color:hsl(221, 87%, 60%)">-</span><span class="token plain"> naive_guess</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> label</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Inverse Naive Guess"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> c</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token string" style="color:hsl(119, 34%, 
47%)">'k'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">gcf</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">set_size_inches</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token number" style="color:hsl(35, 99%, 36%)">12</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> </span><span class="token number" style="color:hsl(35, 99%, 36%)">6</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">plt</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">legend</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">;</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z 
/></svg></span></button></div></div></div>
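<p><img decoding=async loading=lazy alt="Line plot of Gaussian NB accuracy for 1 to 20 PCA components, with horizontal lines marking the naive guess and the inverse naive guess" src=/assets/images/_notebook_13_0-2d0fe64b876b1c32a095f2d74b128f3c.png width=710 height=368 class=img_ev3q></p>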
<p>Nothing. It is interesting to note that the graphs are almost exactly the same: This would imply again that the variables we removed earlier (all the binary classifiers) indeed have almost no predictive power. It seems this problem is high-dimensional, but with almost no data that can actually inform our decisions.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id=summary-for-day-1>Summary for Day 1<a href=#summary-for-day-1 class=hash-link aria-label="Direct link to Summary for Day 1" title="Direct link to Summary for Day 1"></a></h2>
<p>After spending a couple of hours with this dataset, I see a fundamental issue at play: we have very high-dimensional data, and it has no bearing on our ability to actually predict customer satisfaction. This can be a huge issue: it implies that <strong>no matter what model we use, we fundamentally can't perform well.</strong> I'm sure most of this is because I'm not an experienced data scientist. Even so, we have yet to develop a strategy that can actually beat out the village idiot; <strong>so far, the bank is best off just assuming all its customers are satisfied.</strong> Hopefully more to come soon.</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">end </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> datetime</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">now</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"></span><span class="token keyword" style="color:hsl(301, 63%, 40%)">print</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">"Running time: {}"</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token builtin" style="color:hsl(119, 34%, 47%)">format</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain">end </span><span class="token operator" style="color:hsl(221, 87%, 60%)">-</span><span class="token plain"> start</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 
6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> Running time: 0:00:58.715714</span><br></span></code></pre><div class=buttonGroup__atx><button type=button aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id=appendix>Appendix<a href=#appendix class=hash-link aria-label="Direct link to Appendix" title="Direct link to Appendix"></a></h2>
<p>Code used to split the initial training data:</p>
<div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-background-color:hsl(230, 1%, 98%);--prism-color:hsl(230, 8%, 24%)"><div class=codeBlockContent_biex><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="background-color:hsl(230, 1%, 98%);color:hsl(230, 8%, 24%)"><code class=codeBlockLines_e6Vv><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token keyword" style="color:hsl(301, 63%, 40%)">from</span><span class="token plain"> sklearn</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">cross_validation </span><span class="token keyword" style="color:hsl(301, 63%, 40%)">import</span><span class="token plain"> train_test_split</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">data </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> pd</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">read_csv</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">'train.csv'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">data</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">index </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> data</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">ID</span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span 
class="token plain">data_train</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> data_validate </span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token plain"> train_test_split</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain"> data</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">,</span><span class="token plain"> train_size</span><span class="token operator" style="color:hsl(221, 87%, 60%)">=</span><span class="token number" style="color:hsl(35, 99%, 36%)">.7</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain" style=display:inline-block></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">data_train</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">to_csv</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">'split_train.csv'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><span class="token plain"></span><br></span><span class=token-line style="color:hsl(230, 8%, 24%)"><span class="token plain">data_validate</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">.</span><span class="token plain">to_csv</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">(</span><span class="token string" style="color:hsl(119, 34%, 47%)">'split_validate.csv'</span><span class="token punctuation" style="color:hsl(119, 34%, 47%)">)</span><br></span></code></pre><div class=buttonGroup__atx><button type=button 
aria-label="Copy code to clipboard" title=Copy class=clean-btn><span class=copyButtonIcons_eSgA aria-hidden=true><svg viewBox="0 0 24 24" class=copyButtonIcon_y97N><path fill=currentColor d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"/></svg><svg viewBox="0 0 24 24" class=copyButtonSuccessIcon_LjdS><path fill=currentColor d=M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z /></svg></span></button></div></div></div></div></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href=/2016/02/profitability-using-the-investment-formula><div class=pagination-nav__sublabel>Older post</div><div class=pagination-nav__label>Profitability using the investment formula</div></a><a class="pagination-nav__link pagination-nav__link--next" href=/2016/03/tweet-like-me><div class=pagination-nav__sublabel>Newer post</div><div class=pagination-nav__label>Tweet like me</div></a></nav></main><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href=#data-exploration class="table-of-contents__link toc-highlight">Data Exploration</a><ul><li><a href=#dimensionality-reduction-pt-1---binary-classifiers class="table-of-contents__link toc-highlight">Dimensionality Reduction pt. 1 - Binary Classifiers</a><li><a href=#dimensionality-reduction-pt-2---lda class="table-of-contents__link toc-highlight">Dimensionality Reduction pt. 
2 - LDA</a></ul><li><a href=#summary-for-day-1 class="table-of-contents__link toc-highlight">Summary for Day 1</a><li><a href=#appendix class="table-of-contents__link toc-highlight">Appendix</a></ul></div></div></div></div></div><footer class=footer><div class="container container-fluid"><div class="footer__bottom text--center"><div class=footer__copyright>Copyright © 2024 Bradlee Speice</div></div></div></footer></div>