mirror of
https://github.com/bspeice/kiva-dig
synced 2024-12-04 12:48:10 -05:00
Add initial results for model validation
This commit is contained in:
parent
f70a10e6bd
commit
ef28ac8427
@ -3718,7 +3718,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython2",
|
"pygments_lexer": "ipython2",
|
||||||
"version": "2.7.10"
|
"version": "2.7.12"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -367,30 +367,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)\n",
|
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)"
|
||||||
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"342"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"len(train.columns)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -421,8 +398,6 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"train_x = train.drop('bad_loan', axis=1)\n",
|
"train_x = train.drop('bad_loan', axis=1)\n",
|
||||||
"train_y = train['bad_loan']\n",
|
"train_y = train['bad_loan']\n",
|
||||||
"valid_x = valid.drop('bad_loan', axis=1)\n",
|
|
||||||
"valid_y = valid['bad_loan']\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"1 - train_y.mean()"
|
"1 - train_y.mean()"
|
||||||
]
|
]
|
||||||
@ -436,57 +411,101 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Finished training 1\n",
|
||||||
|
"Finished training 0.1\n",
|
||||||
|
"Finished training 0.01\n",
|
||||||
|
"Finished training 10\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from itertools import product\n",
|
"from itertools import product\n",
|
||||||
"import pickle\n",
|
"import pickle\n",
|
||||||
"from sklearn.svm import SVC\n",
|
"from sklearn.linear_model import LogisticRegression\n",
|
||||||
"\n",
|
"\n",
|
||||||
"svc_params = product([1, .5, 1.5], [.001, .01, .1])\n",
|
"for C in [1, .1, .01, 10]:\n",
|
||||||
|
" lr = LogisticRegression(C=C)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for C, gamma in svc_params:\n",
|
" lr.fit(train_x, train_y)\n",
|
||||||
" svc = SVC(C=C, gamma=gamma)\n",
|
" with open('lr_{}.pickle'.format(C), 'w') as handle:\n",
|
||||||
"\n",
|
" pickle.dump(lr, handle)\n",
|
||||||
" svc.fit(train_x, train_y)\n",
|
|
||||||
" with open('svc_{}_{}.pickle'.format(C, gamma), 'w') as handle:\n",
|
|
||||||
" pickle.dump(svc, handle)\n",
|
|
||||||
" \n",
|
" \n",
|
||||||
" print(\"C: {}; gamma: {}; score: {}\".format(\n",
|
" del(lr)\n",
|
||||||
" C, gamma, svc.score(train_x, train_y)))"
|
" print(\"Finished training {}\".format(C))\n",
|
||||||
|
" #print(\"C: {}; gamma: {}; score: {}\".format(\n",
|
||||||
|
" # C, gamma, svc.score(valid_x, valid_y)))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 3,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/usr/lib/python2.7/dist-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.\n",
|
||||||
|
" warnings.warn(\"Variables are collinear.\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Finished training 342\n",
|
||||||
|
"Finished training 250\n",
|
||||||
|
"Finished training 150\n",
|
||||||
|
"Finished training 75\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
|
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
|
||||||
|
"import pickle\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Number of columns is 342\n",
|
"# Number of columns is 342\n",
|
||||||
"for n_components in [342, 250, 150, 75]\n",
|
"for n_components in [342, 250, 150, 75]:\n",
|
||||||
" lda = LinearDiscriminantAnalysis(n_components=n_components)\n",
|
" lda = LinearDiscriminantAnalysis(n_components=n_components)\n",
|
||||||
" lda.fit(train_x, train_y)\n",
|
" lda.fit(train_x, train_y)\n",
|
||||||
" with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n",
|
" with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n",
|
||||||
" pickle.dump(lda, handle)\n",
|
" pickle.dump(lda, handle)\n",
|
||||||
" \n",
|
" \n",
|
||||||
" print(\"N_components: {}; score: {}\".format(\n",
|
" del(lda)\n",
|
||||||
" n_components, lda.score(valid_x, valid_y)))"
|
" print(\"Finished training {}\".format(n_components))\n",
|
||||||
|
" #print(\"N_components: {}; score: {}\".format(\n",
|
||||||
|
" # n_components, lda.score(valid_x, valid_y)))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Finished training 10\n",
|
||||||
|
"Finished training 50\n",
|
||||||
|
"Finished training 75\n",
|
||||||
|
"Finished training 100\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -495,9 +514,123 @@
|
|||||||
" rf.fit(train_x, train_y)\n",
|
" rf.fit(train_x, train_y)\n",
|
||||||
" with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n",
|
" with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n",
|
||||||
" pickle.dump(rf, handle)\n",
|
" pickle.dump(rf, handle)\n",
|
||||||
" \n",
|
" \n",
|
||||||
" print(\"N_estimators: {}; score: {}\".format(\n",
|
" del(rf) \n",
|
||||||
" n_estimators, score(valid_x, valid_y)))"
|
" print(\"Finished training {}\".format(n_estimators))\n",
|
||||||
|
" #print(\"N_estimators: {}; score: {}\".format(\n",
|
||||||
|
" # n_estimators, score(valid_x, valid_y)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"valid_x = valid.drop('bad_loan', axis=1)\n",
|
||||||
|
"valid_y = valid['bad_loan']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Logistic Regression, C=1; Score: 0.89904155569\n",
|
||||||
|
"Logistic Regression, C=0.1; Score: 0.89904155569\n",
|
||||||
|
"Logistic Regression, C=0.01; Score: 0.89904155569\n",
|
||||||
|
"Logistic Regression, C=10; Score: 0.89904155569\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import pickle\n",
|
||||||
|
"lr_params = [1, .1, .01, 10]\n",
|
||||||
|
"\n",
|
||||||
|
"for C in lr_params:\n",
|
||||||
|
" with open('lr_{}.pickle'.format(C)) as handle:\n",
|
||||||
|
" model = pickle.load(handle)\n",
|
||||||
|
" \n",
|
||||||
|
" score = model.score(valid_x, valid_y)\n",
|
||||||
|
" \n",
|
||||||
|
" print('Logistic Regression, C={}; Score: {}'.format(\n",
|
||||||
|
" C, score\n",
|
||||||
|
" ))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Linear Discriminant Analysis, components=342; Score: 0.897066249493\n",
|
||||||
|
"Linear Discriminant Analysis, components=250; Score: 0.897066249493\n",
|
||||||
|
"Linear Discriminant Analysis, components=150; Score: 0.897066249493\n",
|
||||||
|
"Linear Discriminant Analysis, components=75; Score: 0.897066249493\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"lda_components = [342, 250, 150, 75]\n",
|
||||||
|
"\n",
|
||||||
|
"for n_components in lda_components:\n",
|
||||||
|
" with open('lda_{}.pickle'.format(n_components)) as handle:\n",
|
||||||
|
" model = pickle.load(handle)\n",
|
||||||
|
" \n",
|
||||||
|
" score = model.score(valid_x, valid_y)\n",
|
||||||
|
" \n",
|
||||||
|
" print('Linear Discriminant Analysis, components={}; Score: {}'.format(\n",
|
||||||
|
" n_components, score\n",
|
||||||
|
" ))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Random Forests, estimators: 10; Score: 0.934468350545\n",
|
||||||
|
"Random Forests, estimators: 50; Score: 0.936468409953\n",
|
||||||
|
"Random Forests, estimators: 75; Score: 0.936324841332\n",
|
||||||
|
"Random Forests, estimators: 100; Score: 0.936740695268\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"rf_estimators = [10, 50, 75, 100]\n",
|
||||||
|
"\n",
|
||||||
|
"for estimators in rf_estimators:\n",
|
||||||
|
" with open('rf_{}.pickle'.format(estimators)) as handle:\n",
|
||||||
|
" model = pickle.load(handle)\n",
|
||||||
|
" \n",
|
||||||
|
" score = model.score(valid_x, valid_y)\n",
|
||||||
|
" \n",
|
||||||
|
" print('Random Forests, estimators: {}; Score: {}'.format(\n",
|
||||||
|
" estimators, score\n",
|
||||||
|
" ))"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
Loading…
Reference in New Issue
Block a user