STATISTICS: Multivariate analysis with Bull Riders DAY 3

 

Look into Data

The data can be found here: https://courses.edx.org/c4x/UTAustinX/UT.7.01x/asset/BullRiders.csv

Let us check the data.

# Attach the course package; BullRiders is expected to be available once it
# is loaded. Keep a working copy under a shorter name.
library(SDSFoundations)
bull <- BullRiders
# Preview the first six rows of the data.
head(bull, n = 6L)

##           Rider PBRrank YearBorn Height Weight YearsPro Season Events
## 1 Silvano Alves       1     1987     67    150        5   2012     29
## 2  Marco Eguchi       2     1989     67    135        5   2012     29
## 3    Cody Nance       3     1987     68    162        7   2012     28
## 4 Shane Proctor       5     1985     68    165        8   2012     13
## 5 Fabian Vieria       7     1982     68    175        7   2012     23
## 6   J.B. Mauney       8     1987     70    140        8   2012     26
##   BuckOuts Rides CupPoints Place RidePer Wins Top5 Top10 FinalPoints
## 1      103    62  12201.75     1    0.60    2    8    18     1648.00
## 2       84    39   7699.25     9    0.46    1    9    11      571.00
## 3       76    29   5068.25    16    0.38    1    4     7      322.00
## 4       32     8   1387.50    35    0.25    0    0     3        0.00
## 5       70    38   7677.25    10    0.54    2    5    11      171.75
## 6       82    40   9273.25     8    0.49    3    8    13      287.00
##     Earnings
## 1 1464475.61
## 2  226255.50
## 3  131580.54
## 4   52443.96
## 5  191399.08
## 6  313340.27

 

How many of the first 10 riders in the dataset have been pro for more than 10 years?

# Flag which of the first 10 riders have been pro for more than 10 years
bull$YearsPro[1:10] > 10
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
# Count the TRUE flags to answer "how many"
sum(bull$YearsPro[1:10] > 10)
## [1] 2

How many rides were completed by the rider with the fewest buck-outs?

# Fewest buck-outs recorded by any rider; the column is referenced by name
# instead of by its position (9) so the code survives column reordering
min(bull$BuckOuts)
## [1] 4
# Rides completed by the rider(s) with that fewest number of buck-outs,
# which is what the question actually asks for
bull$Rides[bull$BuckOuts == min(bull$BuckOuts)]

Let us check whether there is any correlation between buck-outs and years as a pro.
But first we need to see whether the relationship is linear, because the
correlation coefficient r describes non-linear relationships poorly.

# Scatterplot of buck-outs (y) against years as a pro (x)
plot(x = bull$YearsPro, y = bull$BuckOuts)

Much disappointment! There does not seem to be any correlation. Let us add the line of best fit for this relationship.

# Redraw the scatterplot, then overlay the least-squares regression line
plot(x = bull$YearsPro, y = bull$BuckOuts)
abline(reg = lm(BuckOuts ~ YearsPro, data = bull))

OK, let's try our luck with a different variable: Events.

# Scatterplot of buck-outs (y) against events entered (x), with the
# least-squares regression line drawn on top
plot(x = bull$Events, y = bull$BuckOuts)
abline(reg = lm(BuckOuts ~ Events, data = bull))

# Correlation coefficient of buck-outs with each candidate variable

with(bull, cor(YearsPro, BuckOuts))
## [1] -0.1670275
with(bull, cor(Events, BuckOuts))
## [1] 0.9803737

Can we make a correlation matrix?

# Pairwise correlations among the three variables of interest
myvars <- c("YearsPro", "BuckOuts", "Events")
cor(bull[myvars])
##            YearsPro   BuckOuts     Events
## YearsPro  1.0000000 -0.1670275 -0.1597916
## BuckOuts -0.1670275  1.0000000  0.9803737
## Events   -0.1597916  0.9803737  1.0000000

Top10 vs RidePer

# First variable of interest (RidePer): histogram, five-number summary,
# mean and standard deviation
hist(x = bull$RidePer)

with(bull, fivenum(RidePer))
## [1] 0.19 0.29 0.35 0.49 0.61
with(bull, mean(RidePer))
## [1] 0.3747368
with(bull, sd(RidePer))
## [1] 0.121516
# Second variable of interest (Top10): the same set of descriptives
hist(x = bull$Top10)

with(bull, fivenum(Top10))
## [1]  0.0  2.0  6.5  9.0 18.0
with(bull, mean(Top10))
## [1] 6.236842
with(bull, sd(Top10))
## [1] 4.611585
# Scatterplot of Top10 finishes (y) against ride percentage (x)
plot(x = bull$RidePer, y = bull$Top10)

# Overlay the least-squares regression line
abline(reg = lm(Top10 ~ RidePer, data = bull))

# Correlation coefficient between the two variables
with(bull, cor(RidePer, Top10))
## [1] 0.8554679
# The same relationship shown as a correlation matrix
vars <- c("Top10", "RidePer")
cor(bull[vars])
##             Top10   RidePer
## Top10   1.0000000 0.8554679
## RidePer 0.8554679 1.0000000
# Identify a specific record. RidePer is a double, so it is matched with a
# small tolerance rather than exact `==`, which can miss values that differ
# only by floating-point rounding.
which(bull$Top10 == 5 & abs(bull$RidePer - 0.53) < 1e-8)
## [1] 16

Which variable has the strongest linear relationship with Earnings: Ride Percentage or Cup Points?

# Distribution and summary statistics for Earnings
hist(x = bull$Earnings)

with(bull, fivenum(Earnings))
## [1]   21343.28   55617.33  111147.63  208724.52 1464475.61
with(bull, mean(Earnings))
## [1] 172444.9
# Correlation of Earnings with ride percentage and with cup points
vars <- c("Earnings", "RidePer", "CupPoints")
cor(bull[vars])
##            Earnings   RidePer CupPoints
## Earnings  1.0000000 0.6191194 0.6741746
## RidePer   0.6191194 1.0000000 0.9190072
## CupPoints 0.6741746 0.9190072 1.0000000
# Earnings against ride percentage, with the least-squares line.
# The response is written with the full column name `Earnings`; an
# abbreviation such as `Earning` only resolves through `$` partial matching
# on data frames, which is fragile and is not supported by tibbles.
plot(bull$RidePer, bull$Earnings)
abline(lm(bull$Earnings ~ bull$RidePer))

# Earnings against cup points, with the least-squares line
plot(bull$CupPoints, bull$Earnings)
abline(lm(bull$Earnings ~ bull$CupPoints))

# Correlation matrix for Earnings, ride percentage and cup points
vars <- c("Earnings", "RidePer", "CupPoints")
cor(bull[vars])
##            Earnings   RidePer CupPoints
## Earnings  1.0000000 0.6191194 0.6741746
## RidePer   0.6191194 1.0000000 0.9190072
## CupPoints 0.6741746 0.9190072 1.0000000
# Ride percentage (y) against cup points (x), with the least-squares line
plot(x = bull$CupPoints, y = bull$RidePer)
abline(reg = lm(RidePer ~ CupPoints, data = bull))

# Identify the row of the rider with the highest earnings (the outlier)
top_earner <- which.max(bull$Earnings)
top_earner
## [1] 1
# Let's remove the outlier, using the row index computed above rather than a
# hard-coded row number, then recompute the correlation matrix without it
nooutlier <- bull[-top_earner, ]
cor(nooutlier[, vars])
##            Earnings   RidePer CupPoints
## Earnings  1.0000000 0.8144455 0.9036631
## RidePer   0.8144455 1.0000000 0.9100567
## CupPoints 0.9036631 0.9100567 1.0000000

Rides per event

During a professional bull-riding event, riders usually attempt to
ride a bull three or more times. This means that they can record a
“ride” (successfully staying on the bull) multiple times in the same
event.

Create a new variable for the average number of rides per event for each bull rider in the dataset.

# Insert a new column: average number of rides per event for each rider.
# Assigning through `$<-` creates the column directly, so no NA placeholder
# column has to be added first.
bull$RidesPerEvent <- bull$Rides / bull$Events

Make a histogram of your “rides per event” variable and find the five-number summary for your “rides per event” variable.

# Histogram, five-number summary and mean of rides per event
hist(x = bull$RidesPerEvent)

with(bull, fivenum(RidesPerEvent))
## [1] 0.4545455 0.7500000 1.0000000 1.5384615 2.1379310
with(bull, mean(RidesPerEvent))
## [1] 1.089309

Create a scatterplot of “rides per event” and yearly ranking (defined by the “Place” variable) and add a line of best fit

# Yearly ranking (Place, y) against rides per event (x), with the
# least-squares regression line drawn on top
plot(x = bull$RidesPerEvent, y = bull$Place)
abline(reg = lm(Place ~ RidesPerEvent, data = bull))

# Correlation coefficient between rides per event and Place
with(bull, cor(RidesPerEvent, Place))
## [1] -0.782405


 

 


Leave a comment

Leave a Reply

Your email address will not be published. Required fields are marked *

*