mirror of
https://github.com/bspeice/kiva-dig
synced 2025-01-15 00:20:04 -05:00
Add initial results for model validation
This commit is contained in:
parent
f70a10e6bd
commit
ef28ac8427
@ -3718,7 +3718,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.10"
|
||||
"version": "2.7.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -367,30 +367,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)\n",
|
||||
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"342"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(train.columns)"
|
||||
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -421,8 +398,6 @@
|
||||
"source": [
|
||||
"train_x = train.drop('bad_loan', axis=1)\n",
|
||||
"train_y = train['bad_loan']\n",
|
||||
"valid_x = valid.drop('bad_loan', axis=1)\n",
|
||||
"valid_y = valid['bad_loan']\n",
|
||||
"\n",
|
||||
"1 - train_y.mean()"
|
||||
]
|
||||
@ -436,57 +411,101 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Finished training 1\n",
|
||||
"Finished training 0.1\n",
|
||||
"Finished training 0.01\n",
|
||||
"Finished training 10\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from itertools import product\n",
|
||||
"import pickle\n",
|
||||
"from sklearn.svm import SVC\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"\n",
|
||||
"svc_params = product([1, .5, 1.5], [.001, .01, .1])\n",
|
||||
"for C in [1, .1, .01, 10]:\n",
|
||||
" lr = LogisticRegression(C=C)\n",
|
||||
"\n",
|
||||
"for C, gamma in svc_params:\n",
|
||||
" svc = SVC(C=C, gamma=gamma)\n",
|
||||
"\n",
|
||||
" svc.fit(train_x, train_y)\n",
|
||||
" with open('svc_{}_{}.pickle'.format(C, gamma), 'w') as handle:\n",
|
||||
" pickle.dump(svc, handle)\n",
|
||||
" lr.fit(train_x, train_y)\n",
|
||||
" with open('lr_{}.pickle'.format(C), 'w') as handle:\n",
|
||||
" pickle.dump(lr, handle)\n",
|
||||
" \n",
|
||||
" print(\"C: {}; gamma: {}; score: {}\".format(\n",
|
||||
" C, gamma, svc.score(train_x, train_y)))"
|
||||
" del(lr)\n",
|
||||
" print(\"Finished training {}\".format(C))\n",
|
||||
" #print(\"C: {}; gamma: {}; score: {}\".format(\n",
|
||||
" # C, gamma, svc.score(valid_x, valid_y)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/usr/lib/python2.7/dist-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.\n",
|
||||
" warnings.warn(\"Variables are collinear.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Finished training 342\n",
|
||||
"Finished training 250\n",
|
||||
"Finished training 150\n",
|
||||
"Finished training 75\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
|
||||
"import pickle\n",
|
||||
"\n",
|
||||
"# Number of columns is 342\n",
|
||||
"for n_components in [342, 250, 150, 75]\n",
|
||||
"for n_components in [342, 250, 150, 75]:\n",
|
||||
" lda = LinearDiscriminantAnalysis(n_components=n_components)\n",
|
||||
" lda.fit(train_x, train_y)\n",
|
||||
" with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n",
|
||||
" pickle.dump(lda, handle)\n",
|
||||
" \n",
|
||||
" print(\"N_components: {}; score: {}\".format(\n",
|
||||
" n_components, lda.score(valid_x, valid_y)))"
|
||||
" del(lda)\n",
|
||||
" print(\"Finished training {}\".format(n_components))\n",
|
||||
" #print(\"N_components: {}; score: {}\".format(\n",
|
||||
" # n_components, lda.score(valid_x, valid_y)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Finished training 10\n",
|
||||
"Finished training 50\n",
|
||||
"Finished training 75\n",
|
||||
"Finished training 100\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"\n",
|
||||
@ -495,9 +514,123 @@
|
||||
" rf.fit(train_x, train_y)\n",
|
||||
" with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n",
|
||||
" pickle.dump(rf, handle)\n",
|
||||
" \n",
|
||||
" print(\"N_estimators: {}; score: {}\".format(\n",
|
||||
" n_estimators, score(valid_x, valid_y)))"
|
||||
" \n",
|
||||
" del(rf) \n",
|
||||
" print(\"Finished training {}\".format(n_estimators))\n",
|
||||
" #print(\"N_estimators: {}; score: {}\".format(\n",
|
||||
" # n_estimators, score(valid_x, valid_y)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n",
|
||||
"\n",
|
||||
"valid_x = valid.drop('bad_loan', axis=1)\n",
|
||||
"valid_y = valid['bad_loan']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Logistic Regression, C=1; Score: 0.89904155569\n",
|
||||
"Logistic Regression, C=0.1; Score: 0.89904155569\n",
|
||||
"Logistic Regression, C=0.01; Score: 0.89904155569\n",
|
||||
"Logistic Regression, C=10; Score: 0.89904155569\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pickle\n",
|
||||
"lr_params = [1, .1, .01, 10]\n",
|
||||
"\n",
|
||||
"for C in lr_params:\n",
|
||||
" with open('lr_{}.pickle'.format(C)) as handle:\n",
|
||||
" model = pickle.load(handle)\n",
|
||||
" \n",
|
||||
" score = model.score(valid_x, valid_y)\n",
|
||||
" \n",
|
||||
" print('Logistic Regression, C={}; Score: {}'.format(\n",
|
||||
" C, score\n",
|
||||
" ))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Linear Discriminant Analysis, components=342; Score: 0.897066249493\n",
|
||||
"Linear Discriminant Analysis, components=250; Score: 0.897066249493\n",
|
||||
"Linear Discriminant Analysis, components=150; Score: 0.897066249493\n",
|
||||
"Linear Discriminant Analysis, components=75; Score: 0.897066249493\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"lda_components = [342, 250, 150, 75]\n",
|
||||
"\n",
|
||||
"for n_components in lda_components:\n",
|
||||
" with open('lda_{}.pickle'.format(n_components)) as handle:\n",
|
||||
" model = pickle.load(handle)\n",
|
||||
" \n",
|
||||
" score = model.score(valid_x, valid_y)\n",
|
||||
" \n",
|
||||
" print('Linear Discriminant Analysis, components={}; Score: {}'.format(\n",
|
||||
" n_components, score\n",
|
||||
" ))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Random Forests, estimators: 10; Score: 0.934468350545\n",
|
||||
"Random Forests, estimators: 50; Score: 0.936468409953\n",
|
||||
"Random Forests, estimators: 75; Score: 0.936324841332\n",
|
||||
"Random Forests, estimators: 100; Score: 0.936740695268\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"rf_estimators = [10, 50, 75, 100]\n",
|
||||
"\n",
|
||||
"for estimators in rf_estimators:\n",
|
||||
" with open('rf_{}.pickle'.format(estimators)) as handle:\n",
|
||||
" model = pickle.load(handle)\n",
|
||||
" \n",
|
||||
" score = model.score(valid_x, valid_y)\n",
|
||||
" \n",
|
||||
" print('Random Forests, estimators: {}; Score: {}'.format(\n",
|
||||
" estimators, score\n",
|
||||
" ))"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
Loading…
Reference in New Issue
Block a user