"use strict";(self.webpackChunkspeice_io=self.webpackChunkspeice_io||[]).push([["302"],{10715:function(e,n,t){t.r(n),t.d(n,{assets:function(){returnl},contentTitle:function(){returno},default:function(){returnh},frontMatter:function(){returnr},metadata:function(){returns},toc:function(){returnc}});vars=t(80414),a=t(85893),i=t(50065);letr={slug:"2016/03/predicting-santander-customer-happiness",title:"Predicting Santander customer happiness",date:newDate("2016-03-05T12:00:00.000Z"),authors:["bspeice"],tags:[]},o=void0,l={authorsImageUrls:[void0]},c=[{value:"Data Exploration",id:"data-exploration",level:2},{value:"Dimensionality Reduction pt. 1 - Binary Classifiers",id:"dimensionality-reduction-pt-1---binary-classifiers",level:3},{value:"Dimensionality Reduction pt. 2 - LDA",id:"dimensionality-reduction-pt-2---lda",level:3},{value:"Summary for Day 1",id:"summary-for-day-1",level:2},{value:"Appendix",id:"appendix",level:2}];functiond(e){letn={a:"a",annotation:"annotation",code:"code",h2:"h2",h3:"h3",img:"img",li:"li",math:"math",mi:"mi",mn:"mn",mo:"mo",mrow:"mrow",p:"p",pre:"pre",semantics:"semantics",span:"span",strong:"strong",ul:"ul",...(0,i.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(n.p,{children:"My first Kaggle competition."}),"\n",(0,a.jsxs)(n.p,{children:["It's time! After embarking on a Machine Learning class this semester, and with a Saturday in which I don't have much planned, I wanted to put this class and training to work. It's my first competition submission. I want to walk you guys through how I'm approaching this problem, because I thought it would be really neat. The competition is Banco Santander's ",(0,a.jsx)(n.a,{href:"https://www.kaggle.com/c/santander-customer-satisfaction",children:"Santander Customer Satisfaction"})," competition. It seemed like an easy enough problem I could actually make decent progress on it."]}),"\n",(0,a.jsx)(n.h2,{id:"data-exploration",children:"Data Exploration"}),"\n",(0,a.jsxs)(n.p,{children:["First up: we need to load our data and do some exploratory work. Because we're going to be using this data for model selection prior to testing, we need to make a further split. I've already gone ahead and done this work, please see the code in the ",(0,a.jsx)(n.a,{href:"#appendix",children:"appendix below"}),"."]}),"\n",(0,a.jsx)(n.pre,{children:(0,a.jsx)(n.code,{className:"language-python",children:"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n%matplotlib inline\n\n# Record how long it takes to run the notebook - I'm curious.\nfrom datetime import datetime\nstart = datetime.now()\n\ndataset = pd.read_csv('split_train.csv')\ndataset.index = dataset.ID\nX = dataset.drop(['TARGET', 'ID', 'ID.1'], 1)\ny = dataset.TARGET\n"})}),"\n",(0,a.jsx)(n.pre,{children:(0,a.jsx)(n.code,{className:"language-python",children:"y.unique()\n"})}),"\n",(0,a.jsx)(n.pre,{children:(0,a.jsx)(n.code,{children:" array([0, 1], dtype=int64)\n"})}),"\n",(0,a.jsx)(n.pre,{children:(0,a.jsx)(n.code,{className:"language-python",children:"len(X.columns)\n"})}),"\n",(0,a.jsx)(n.pre,{children:(0,a.jsx)(n.code,{children:" 369\n"})}),"\n",(0,a.jsxs)(n.p,{children:["Okay, so there are only ",(0,a.jsx)(n.a,{href:"https://www.kaggle.com/c/santander-customer-satisfaction/data",children:"two classes we're predicting"}),": 1 for unsatisfied customers, 0 for satisfied customers. I would have preferred this to be something more like a regression, or predicting multiple classes: maybe the customer isn't the most happy, but is nowhere near closing their accounts. For now though, that's just the data we're working with."]}),"\n",(0,a.jsx)(n.p,{children:"Now, I'd like to make a scatter matrix of everything going on. Unfortunately as noted above, we have 369 different features. There's no way I can graphically make sense of that much data to start with."}),"\n",(0,a.jsx)(n.p,{children:"We'realsonottoldwhatthedataactuallyrepresents:Arethesesurveyresults?Averagetimebetweencontactwithacustomercareperson?Frequencyofcontactingacustomercare