#class.py
#
#by Joe Hahn, jhahn@spacescience.org, 30 January 2014.
#
#linear regression
#
#to execute in ipython:
#    > ipython --pylab
#    In [1]: %run class.py

#import modules used below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#read grades
grades = pd.read_csv('grades.csv', index_col=0)

#sum the maximum possible scores on the five assignments
max_grades = grades[grades.index.values == 'max score']
max_assignment = (max_grades.assignment1.values[0] + max_grades.assignment2.values[0]
    + max_grades.assignment3.values[0] + max_grades.assignment4.values[0]
    + max_grades.assignment5.values[0])

#trim the 'max score' row and any empty rows, then sort by student ID number
grades = grades[grades.index.values != 'max score']
grades = grades.dropna(how='all')
grades['id'] = grades.index.values.astype(int)
grades = grades.sort_values(by='id')

#create column for average assignment score
grades['avg_assignment'] = 100.0*(grades.assignment1 + grades.assignment2
    + grades.assignment3 + grades.assignment4 + grades.assignment5)/max_assignment

#extract the 3 features considered here: student id, average homework score,
#and number of website page views, plus the regression target = semester grade
id = grades.id.values
homework = grades.avg_assignment.values
views = grades.pageviews.values
grade = grades.FinalScore.values

#plot semester grade versus the 3 features
id_rng = [0, 100]
hw_rng = [35, 110]
view_rng = [0, 600]
grade_rng = [60, 105]
plt.close()
plt.figure(figsize=(13, 5.0))
plt.rcParams.update({'font.size': 12})
ax1 = plt.subplot(1, 3, 1)
ax1.set_xlabel('avg homework grade')
ax1.set_ylabel('semester grade')
ax1.set_xlim(hw_rng)
ax1.set_ylim(grade_rng)
ax1.plot(homework, grade, marker='o', markersize=6.0, color='blue', linestyle='None', label='data')
ax2 = plt.subplot(1, 3, 2)
ax2.set_xlabel('class website page views')
ax2.set_ylabel('semester grade')
ax2.set_xlim(view_rng)
ax2.set_ylim(grade_rng)
ax2.plot(views, grade, marker='o', markersize=7.0, color='blue', linestyle='None', label='data')
ax3 = plt.subplot(1, 3, 3)
ax3.set_xlabel('student ID')
ax3.set_ylabel('semester grade')
ax3.set_xlim(id_rng)
ax3.set_ylim(grade_rng)
ax3.plot(id, grade, marker='o', markersize=8.0, color='blue', linestyle='None', label='data')
plt.show(block=False)
plt.draw()
plt.tight_layout()

#create the usual x = array of each student's three features and target y = semester grade,
#normalize the input data, and then split into training and testing groups;
#note that StandardScaler expects 2D input, so the 1D target y is reshaped
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
x = np.array([[homework[j], views[j], id[j]] for j in range(len(id))])
y = grade
scaler_x = StandardScaler().fit(x)
scaler_y = StandardScaler().fit(y.reshape(-1, 1))
xn = scaler_x.transform(x)
yn = scaler_y.transform(y.reshape(-1, 1)).ravel()
xn_train, xn_test, yn_train, yn_test = train_test_split(xn, yn, test_size=0.25, random_state=11)

#train the model on the training dataset, then calculate its
#R2 score = fraction of the outcome's variation that is explained by this model
clf = linear_model.LinearRegression()
clf.fit(xn_train, yn_train)
print('coefficient of determination R2 (linear regression) = ', clf.score(xn_train, yn_train))
print('coefficients for linear regression = ', clf.coef_)
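#as a sanity check, the R2 score can also be computed directly from its
#definition, R2 = 1 - SS_res/SS_tot, where SS_res = sum of squared residuals
#and SS_tot = total sum of squares about the mean; this sketch should
#reproduce clf.score(xn_train, yn_train) to within roundoff
yn_fit = clf.predict(xn_train)
ss_res = ((yn_train - yn_fit)**2).sum()
ss_tot = ((yn_train - yn_train.mean())**2).sum()
print('R2 computed from its definition = ', 1.0 - ss_res/ss_tot)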
#apply the model to randomly generated students
Np = 100
homework_ran = np.random.uniform(hw_rng[0], hw_rng[1], size=Np)
views_ran = np.random.uniform(view_rng[0], view_rng[1], size=Np)
id_ran = np.random.uniform(id_rng[0], id_rng[1], size=Np)
x_ran = np.array([[homework_ran[j], views_ran[j], id_ran[j]] for j in range(Np)])
xn_ran = scaler_x.transform(x_ran)
yn_predict = clf.predict(xn_ran)
y_predict = scaler_y.inverse_transform(yn_predict.reshape(-1, 1)).ravel()
ax1.plot(homework_ran, y_predict, marker='.', markersize=4.0, color='red', linestyle='None', label='linear')
ax2.plot(views_ran, y_predict, marker='.', markersize=4.0, color='red', linestyle='None', label='linear')
ax3.plot(id_ran, y_predict, marker='.', markersize=4.0, color='red', linestyle='None', label='linear')
ax1.legend(loc='lower right')
ax2.legend(loc='lower right')
ax3.legend(loc='lower right')
plt.show(block=False)
plt.draw()

#use an SVM to do the regression; gamma='auto' selects gamma = 1/n_features
from sklearn import svm
clf_svr = svm.SVR(kernel='rbf', gamma='auto')
clf_svr.fit(xn_train, yn_train)
print('coefficient of determination R2 (SVM) = ', clf_svr.score(xn_train, yn_train))
homework_ran = np.random.uniform(hw_rng[0], hw_rng[1], size=Np)
views_ran = np.random.uniform(view_rng[0], view_rng[1], size=Np)
id_ran = np.random.uniform(id_rng[0], id_rng[1], size=Np)
x_ran = np.array([[homework_ran[j], views_ran[j], id_ran[j]] for j in range(Np)])
xn_ran = scaler_x.transform(x_ran)
yn_predict = clf_svr.predict(xn_ran)
y_predict = scaler_y.inverse_transform(yn_predict.reshape(-1, 1)).ravel()
ax1.plot(homework_ran, y_predict, marker='.', markersize=4.0, color='green', linestyle='None', label='svm')
ax2.plot(views_ran, y_predict, marker='.', markersize=4.0, color='green', linestyle='None', label='svm')
ax3.plot(id_ran, y_predict, marker='.', markersize=4.0, color='green', linestyle='None', label='svm')
ax1.legend(loc='lower right')
ax2.legend(loc='lower right')
ax3.legend(loc='lower right')
plt.show(block=False)
plt.draw()
plt.savefig('class.png', dpi=200)

#select the two most important features (homework & views) and train on them
#only...this doesn't improve things though
#from sklearn import feature_selection
#fs = feature_selection.SelectKBest(feature_selection.f_regression, k=2)
#xn_train_fs = fs.fit_transform(xn_train, yn_train)
#clf_svr.fit(xn_train_fs, yn_train)
#print('coefficient of determination R2 (SVM, k=2) = ', clf_svr.score(xn_train_fs, yn_train))
#xn_ran_fs = fs.transform(xn_ran)
#yn_predict_fs = clf_svr.predict(xn_ran_fs)
#y_predict_fs = scaler_y.inverse_transform(yn_predict_fs.reshape(-1, 1)).ravel()
#ax1.plot(homework_ran, y_predict_fs, marker='.', markersize=4.0, color='black', linestyle='None', label='svm/fs')
#ax2.plot(views_ran, y_predict_fs, marker='.', markersize=4.0, color='black', linestyle='None', label='svm/fs')
#plt.show(block=False)
#plt.draw()
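#the R2 scores printed above are measured on the training data, so they can
#flatter each model; a minimal follow-up check scores the same fitted models
#on the held-out test split instead
print('R2 on test data (linear regression) = ', clf.score(xn_test, yn_test))
print('R2 on test data (SVM) = ', clf_svr.score(xn_test, yn_test))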