1
0
mirror of https://github.com/bspeice/kiva-dig synced 2024-12-04 20:58:09 -05:00

Add initial results for model validation

This commit is contained in:
Bradlee Speice 2016-12-03 10:06:50 -05:00
parent f70a10e6bd
commit ef28ac8427
2 changed files with 184 additions and 51 deletions

View File

@ -3718,7 +3718,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython2", "pygments_lexer": "ipython2",
"version": "2.7.10" "version": "2.7.12"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -367,30 +367,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)\n", "train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)"
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"342"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(train.columns)"
] ]
}, },
{ {
@ -421,8 +398,6 @@
"source": [ "source": [
"train_x = train.drop('bad_loan', axis=1)\n", "train_x = train.drop('bad_loan', axis=1)\n",
"train_y = train['bad_loan']\n", "train_y = train['bad_loan']\n",
"valid_x = valid.drop('bad_loan', axis=1)\n",
"valid_y = valid['bad_loan']\n",
"\n", "\n",
"1 - train_y.mean()" "1 - train_y.mean()"
] ]
@ -436,57 +411,101 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": { "metadata": {
"collapsed": false "collapsed": false
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished training 1\n",
"Finished training 0.1\n",
"Finished training 0.01\n",
"Finished training 10\n"
]
}
],
"source": [ "source": [
"from itertools import product\n", "from itertools import product\n",
"import pickle\n", "import pickle\n",
"from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n",
"\n", "\n",
"svc_params = product([1, .5, 1.5], [.001, .01, .1])\n", "for C in [1, .1, .01, 10]:\n",
" lr = LogisticRegression(C=C)\n",
"\n", "\n",
"for C, gamma in svc_params:\n", " lr.fit(train_x, train_y)\n",
" svc = SVC(C=C, gamma=gamma)\n", " with open('lr_{}.pickle'.format(C), 'w') as handle:\n",
"\n", " pickle.dump(lr, handle)\n",
" svc.fit(train_x, train_y)\n",
" with open('svc_{}_{}.pickle'.format(C, gamma), 'w') as handle:\n",
" pickle.dump(svc, handle)\n",
" \n", " \n",
" print(\"C: {}; gamma: {}; score: {}\".format(\n", " del(lr)\n",
" C, gamma, svc.score(train_x, train_y)))" " print(\"Finished training {}\".format(C))\n",
" #print(\"C: {}; gamma: {}; score: {}\".format(\n",
" # C, gamma, svc.score(valid_x, valid_y)))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": { "metadata": {
"collapsed": true "collapsed": false
}, },
"outputs": [], "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python2.7/dist-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.\n",
" warnings.warn(\"Variables are collinear.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished training 342\n",
"Finished training 250\n",
"Finished training 150\n",
"Finished training 75\n"
]
}
],
"source": [ "source": [
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
"import pickle\n",
"\n", "\n",
"# Number of columns is 342\n", "# Number of columns is 342\n",
"for n_components in [342, 250, 150, 75]\n", "for n_components in [342, 250, 150, 75]:\n",
" lda = LinearDiscriminantAnalysis(n_components=n_components)\n", " lda = LinearDiscriminantAnalysis(n_components=n_components)\n",
" lda.fit(train_x, train_y)\n", " lda.fit(train_x, train_y)\n",
" with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n", " with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n",
" pickle.dump(lda, handle)\n", " pickle.dump(lda, handle)\n",
" \n", " \n",
" print(\"N_components: {}; score: {}\".format(\n", " del(lda)\n",
" n_components, lda.score(valid_x, valid_y)))" " print(\"Finished training {}\".format(n_components))\n",
" #print(\"N_components: {}; score: {}\".format(\n",
" # n_components, lda.score(valid_x, valid_y)))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": { "metadata": {
"collapsed": true "collapsed": false
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished training 10\n",
"Finished training 50\n",
"Finished training 75\n",
"Finished training 100\n"
]
}
],
"source": [ "source": [
"from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n",
"\n", "\n",
@ -495,9 +514,123 @@
" rf.fit(train_x, train_y)\n", " rf.fit(train_x, train_y)\n",
" with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n", " with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n",
" pickle.dump(rf, handle)\n", " pickle.dump(rf, handle)\n",
" \n", " \n",
" print(\"N_estimators: {}; score: {}\".format(\n", " del(rf) \n",
" n_estimators, score(valid_x, valid_y)))" " print(\"Finished training {}\".format(n_estimators))\n",
" #print(\"N_estimators: {}; score: {}\".format(\n",
" # n_estimators, score(valid_x, valid_y)))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n",
"\n",
"valid_x = valid.drop('bad_loan', axis=1)\n",
"valid_y = valid['bad_loan']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression, C=1; Score: 0.89904155569\n",
"Logistic Regression, C=0.1; Score: 0.89904155569\n",
"Logistic Regression, C=0.01; Score: 0.89904155569\n",
"Logistic Regression, C=10; Score: 0.89904155569\n"
]
}
],
"source": [
"import pickle\n",
"lr_params = [1, .1, .01, 10]\n",
"\n",
"for C in lr_params:\n",
" with open('lr_{}.pickle'.format(C)) as handle:\n",
" model = pickle.load(handle)\n",
" \n",
" score = model.score(valid_x, valid_y)\n",
" \n",
" print('Logistic Regression, C={}; Score: {}'.format(\n",
" C, score\n",
" ))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear Discriminant Analysis, components=342; Score: 0.897066249493\n",
"Linear Discriminant Analysis, components=250; Score: 0.897066249493\n",
"Linear Discriminant Analysis, components=150; Score: 0.897066249493\n",
"Linear Discriminant Analysis, components=75; Score: 0.897066249493\n"
]
}
],
"source": [
"lda_components = [342, 250, 150, 75]\n",
"\n",
"for n_components in lda_components:\n",
" with open('lda_{}.pickle'.format(n_components)) as handle:\n",
" model = pickle.load(handle)\n",
" \n",
" score = model.score(valid_x, valid_y)\n",
" \n",
" print('Linear Discriminant Analysis, components={}; Score: {}'.format(\n",
" n_components, score\n",
" ))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Random Forests, estimators: 10; Score: 0.934468350545\n",
"Random Forests, estimators: 50; Score: 0.936468409953\n",
"Random Forests, estimators: 75; Score: 0.936324841332\n",
"Random Forests, estimators: 100; Score: 0.936740695268\n"
]
}
],
"source": [
"rf_estimators = [10, 50, 75, 100]\n",
"\n",
"for estimators in rf_estimators:\n",
" with open('rf_{}.pickle'.format(estimators)) as handle:\n",
" model = pickle.load(handle)\n",
" \n",
" score = model.score(valid_x, valid_y)\n",
" \n",
" print('Random Forests, estimators: {}; Score: {}'.format(\n",
" estimators, score\n",
" ))"
] ]
} }
], ],