diff --git a/Basic Prediction.ipynb b/Basic Prediction.ipynb index c47488b..b9a6ce9 100644 --- a/Basic Prediction.ipynb +++ b/Basic Prediction.ipynb @@ -3718,7 +3718,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.12" } }, "nbformat": 4, diff --git a/Default Prediction.ipynb b/Default Prediction.ipynb index 0a8e6c4..e3aef97 100644 --- a/Default Prediction.ipynb +++ b/Default Prediction.ipynb @@ -367,30 +367,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)\n", - "valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "342" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(train.columns)" + "train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)" ] }, { @@ -421,8 +398,6 @@ "source": [ "train_x = train.drop('bad_loan', axis=1)\n", "train_y = train['bad_loan']\n", - "valid_x = valid.drop('bad_loan', axis=1)\n", - "valid_y = valid['bad_loan']\n", "\n", "1 - train_y.mean()" ] @@ -436,57 +411,101 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished training 1\n", + "Finished training 0.1\n", + "Finished training 0.01\n", + "Finished training 10\n" + ] + } + ], "source": [ "from itertools import product\n", "import pickle\n", - "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression\n", "\n", - "svc_params = product([1, .5, 1.5], [.001, .01, .1])\n", + "for C in [1, .1, .01, 10]:\n", + " lr = LogisticRegression(C=C)\n", "\n", - "for C, gamma in svc_params:\n", - " svc = SVC(C=C, gamma=gamma)\n", - "\n", - " svc.fit(train_x, train_y)\n", - " with open('svc_{}_{}.pickle'.format(C, gamma), 'w') as handle:\n", - " pickle.dump(svc, handle)\n", + " lr.fit(train_x, train_y)\n", + " with open('lr_{}.pickle'.format(C), 'w') as handle:\n", + " pickle.dump(lr, handle)\n", " \n", - " print(\"C: {}; gamma: {}; score: {}\".format(\n", - " C, gamma, svc.score(train_x, train_y)))" + " del(lr)\n", + " print(\"Finished training {}\".format(C))\n", + " #print(\"C: {}; gamma: {}; score: {}\".format(\n", + " # C, gamma, svc.score(valid_x, valid_y)))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python2.7/dist-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.\n", + " warnings.warn(\"Variables are collinear.\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished training 342\n", + "Finished training 250\n", + "Finished training 150\n", + "Finished training 75\n" + ] + } + ], "source": [ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", + "import pickle\n", "\n", "# Number of columns is 342\n", - "for n_components in [342, 250, 150, 75]\n", + "for n_components in [342, 250, 150, 75]:\n", " lda = LinearDiscriminantAnalysis(n_components=n_components)\n", " lda.fit(train_x, train_y)\n", " with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n", " pickle.dump(lda, handle)\n", " \n", - " print(\"N_components: {}; score: {}\".format(\n", - " n_components, lda.score(valid_x, valid_y)))" + " del(lda)\n", + " print(\"Finished training {}\".format(n_components))\n", + " #print(\"N_components: {}; score: {}\".format(\n", + " # n_components, lda.score(valid_x, valid_y)))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished training 10\n", + "Finished training 50\n", + "Finished training 75\n", + "Finished training 100\n" + ] + } + ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", @@ -495,9 +514,123 @@ " rf.fit(train_x, train_y)\n", " with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n", " pickle.dump(rf, handle)\n", - " \n", - " print(\"N_estimators: {}; score: {}\".format(\n", - " n_estimators, score(valid_x, valid_y)))" + " \n", + " del(rf) \n", + " print(\"Finished training {}\".format(n_estimators))\n", + " #print(\"N_estimators: {}; score: {}\".format(\n", + " # n_estimators, score(valid_x, valid_y)))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n", + "\n", + "valid_x = valid.drop('bad_loan', axis=1)\n", + "valid_y = valid['bad_loan']" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Logistic Regression, C=1; Score: 0.89904155569\n", + "Logistic Regression, C=0.1; Score: 0.89904155569\n", + "Logistic Regression, C=0.01; Score: 0.89904155569\n", + "Logistic Regression, C=10; Score: 0.89904155569\n" + ] + } + ], + "source": [ + "import pickle\n", + "lr_params = [1, .1, .01, 10]\n", + "\n", + "for C in lr_params:\n", + " with open('lr_{}.pickle'.format(C)) as handle:\n", + " model = pickle.load(handle)\n", + " \n", + " score = model.score(valid_x, valid_y)\n", + " \n", + " print('Logistic Regression, C={}; Score: {}'.format(\n", + " C, score\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Linear Discriminant Analysis, components=342; Score: 0.897066249493\n", + "Linear Discriminant Analysis, components=250; Score: 0.897066249493\n", + "Linear Discriminant Analysis, components=150; Score: 0.897066249493\n", + "Linear Discriminant Analysis, components=75; Score: 0.897066249493\n" + ] + } + ], + "source": [ + "lda_components = [342, 250, 150, 75]\n", + "\n", + "for n_components in lda_components:\n", + " with open('lda_{}.pickle'.format(n_components)) as handle:\n", + " model = pickle.load(handle)\n", + " \n", + " score = model.score(valid_x, valid_y)\n", + " \n", + " print('Linear Discriminant Analysis, components={}; Score: {}'.format(\n", + " n_components, score\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forests, estimators: 10; Score: 0.934468350545\n", + "Random Forests, estimators: 50; Score: 0.936468409953\n", + "Random Forests, estimators: 75; Score: 0.936324841332\n", + "Random Forests, estimators: 100; Score: 0.936740695268\n" + ] + } + ], + "source": [ + "rf_estimators = [10, 50, 75, 100]\n", + "\n", + "for estimators in rf_estimators:\n", + " with open('rf_{}.pickle'.format(estimators)) as handle:\n", + " model = pickle.load(handle)\n", + " \n", + " score = model.score(valid_x, valid_y)\n", + " \n", + " print('Random Forests, estimators: {}; Score: {}'.format(\n", + " estimators, score\n", + " ))" ] } ],