import pandas as pd import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import axes3d import seaborn as sns from sklearn.preprocessing import scale import sklearn.linear_model as skl_lm from sklearn.metrics import mean_squared_error, r2_score import statsmodels.api as sm import statsmodels.formula.api as smf %matplotlib inline plt.style.use('seaborn-white')

# Load the player data. The data is broken down into 4 csv files by birth year:
#   a - 1913-1933; b - 1934-1959; c - 1960-1979; d - 1980-1997
# Replace '/.../' with the location of your files.
# NOTE(review): the file names for b, c and d are assumed to follow the same
# '<years>nba.csv' pattern as the first file - confirm against your data.
a = pd.read_csv('/.../1913-1933nba.csv')
b = pd.read_csv('/.../1934-1959nba.csv')
c = pd.read_csv('/.../1960-1979nba.csv')
d = pd.read_csv('/.../1980-1997nba.csv')

# Preview the first rows of the earliest data set.
a.head()

# Height against weight with a first-order regression line per data set.
# The scatter colour of each data set is selected with a matplotlib colour letter.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
for frame, colour in ((a, 'g'), (b, 'r'), (c, 'b'), (d, 'y')):
    sns.regplot(x=frame.weight_lbs, y=frame.height_ft, order=1, ci=None,
                scatter_kws={'color': colour, 's': 12})

plt.xlim(140, 325)
# 'bottom' replaces the 'ymin' keyword, which current matplotlib rejects.
plt.ylim(bottom=5.5);

# Least-squares fit of height on weight for data set `a`.
# Run the same cell with b, c or d to get the coefficients of the other data sets.
X = a['weight_lbs'].values.reshape(-1, 1)  # single feature as a column matrix
y = a['height_ft']

regr = skl_lm.LinearRegression()
regr.fit(X, y)

print(regr.intercept_)
print(regr.coef_)

# Birth year against height with a first-order regression line per data set.
# green data points and blue line indicate 1913-1933; red data points and orange line indicate 1934-1959
# blue data points and green line indicate 1960-1979; yellow data points and red line indicate 1980-1997
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
for frame, colour in ((a, 'g'), (b, 'r'), (c, 'b'), (d, 'y')):
    sns.regplot(x=frame.height_ft, y=frame.born, order=1, ci=None,
                scatter_kws={'color': colour, 's': 9})

plt.xlim(5.5, 7.5)
plt.ylim(1913, 1997);

# Descriptive statistics of weight and height for data set `a`.
# Run the descriptive statistics for all data sets by swapping in b, c or d.
a.loc[:, ['weight_lbs', 'height_ft']].describe()

# Create a coordinate grid covering the weight (150-350 lbs) and height
# (5.5-8 ft) window used by the 3D plot. The grid keeps a 300 x 50 shape
# (rows follow height_ft, columns follow weight_lbs).
weight_lbs = np.linspace(150, 350, 50)
height_ft = np.linspace(5.5, 8, 300)
B1, B2 = np.meshgrid(weight_lbs, height_ft, indexing='xy')

# The surface needs one slope per horizontal axis, so fit birth year on both
# weight and height here. A model fitted on weight alone exposes a single
# coefficient, and indexing coef_[1] on it raises IndexError.
surface_regr = skl_lm.LinearRegression()
surface_regr.fit(a[['weight_lbs', 'height_ft']].values, a.born)

# Predicted birth year at every grid node, evaluated in one vectorised step.
Z = (surface_regr.intercept_
     + B1 * surface_regr.coef_[0]
     + B2 * surface_regr.coef_[1])

# Create plot: regression surface plus one 3D scatter per data set.
fig = plt.figure(figsize=(12, 8))
fig.suptitle('NBA players born between 1910 - 1997', fontsize=20)

# Ask the figure for a 3D axes. Constructing axes3d.Axes3D(fig) directly no
# longer attaches the axes to the figure in current matplotlib, which leaves
# the figure empty.
ax = fig.add_subplot(projection='3d')

ax.plot_surface(B1, B2, Z, rstride=10, cstride=5, alpha=0.4)
for frame, colour in ((a, 'g'), (b, 'r'), (c, 'b'), (d, 'y')):
    ax.scatter3D(frame.weight_lbs, frame.height_ft, frame.born, c=colour)

ax.set_xlabel('weight_lbs')
ax.set_xlim(350, 150)  # limits given high-to-low, so the weight axis is reversed
ax.set_ylabel('height_ft')
ax.set_ylim(5.5, 8)
ax.set_zlabel('born')
ax.set_zlim(1910, 1997);

# One height/weight pair plot per data set, drawn in the order a, b, c, d.
pair_columns = ['height_ft', 'weight_lbs']
for frame in (a, b, c, d):
    sns.pairplot(frame[pair_columns])

# Hexagonal-bin joint plot of height against weight for data set `a`.
# Interchange the other data sets (b, c, d) into this code via `data=`.
sns.jointplot(data=a, x='weight_lbs', y='height_ft', kind='hex')

# Hexbin plot of weight against height for data set `a`.
# The loaded data is plotted directly: rebinding `a` to a frame of random
# normal numbers would discard the CSV data and plot noise instead of
# player measurements.
# Interchange the other data sets (b, c, d) into this code.
a.plot.hexbin(x='height_ft', y='weight_lbs', gridsize=25, cmap='Oranges')