onboarding/r_rstudio/exercise_solutions.md at master · TeamINTERACT/onboarding

Ensure tidyverse and dplyr are installed and loaded:

# Install each package only when it is missing, then attach it. Guarding the
# install avoids re-downloading the packages every time the script is run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library("tidyverse")
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr")

CCHS Data:

Read in the data cchs.csv

# Load the CCHS extract; replace the placeholder directory with the real path.
cchs <- read_csv(file = "csv_file_location/cchs.csv")

Quickly view the first 10 rows of data

# Print the first 10 rows.
cchs %>% head(n = 10)

Display the type of variable for all variables in the dataset

# Report the class of every column (interactive inspection only).
cchs %>% sapply(class)

Create a scatterplot on height and weight

# Height on x, weight on y, using the raw (uncleaned) data.
ggplot(data = cchs, mapping = aes(x = hwtghtm, y = hwtgwtk)) +
  geom_point()

Clean the height and weight data so missing data coded as numbers become NA in R

# Worse
# Converts both columns to factors, then drops the two height levels
# "9.996" and "9.999" by recoding them to NULL (which makes them NA).
# NOTE(review): only height is recoded here; weight becomes a factor but no
# weight levels are set to NA.
cchs_clean <- cchs %>%
  mutate(
    hwtghtm = factor(hwtghtm),
    hwtgwtk = factor(hwtgwtk)
  ) %>%
  mutate(hwtghtm = fct_recode(hwtghtm, NULL = "9.996", NULL = "9.999"))

# Better
# Keep the columns numeric: any height above 8.0 or weight above 900.0 is
# replaced with NA; every other value (including existing NAs) passes through.
cchs_clean <- cchs %>%
  mutate(
    hwtghtm = case_when(hwtghtm > 8.0 ~ NA_real_, TRUE ~ hwtghtm),
    hwtgwtk = case_when(hwtgwtk > 900.0 ~ NA_real_, TRUE ~ hwtgwtk)
  )

Create a new scatterplot on height and weight with the clean data

# Same scatterplot as before, now drawn from the cleaned data.
ggplot(data = cchs_clean, mapping = aes(x = hwtghtm, y = hwtgwtk)) +
  geom_point()

Recode BMI to represent weight categories from underweight to obese

# Bucket BMI into weight categories. case_when() takes the first matching
# condition, so ascending upper bounds give the half-open ranges
# [.., 18.5), [18.5, 25), [25, 30), [30, 999). Values of 999 and above, and
# NA, fall through to "other".
cchs_clean <- cchs_clean %>%
  mutate(
    bmi_category = case_when(
      hwtgbmi < 18.5 ~ "underweight",
      hwtgbmi < 25 ~ "normal weight",
      hwtgbmi < 30 ~ "overweight",
      hwtgbmi < 999 ~ "obese",
      TRUE ~ "other"
    )
  )

Recode Province to include the province names instead of numbers

# Translate the numeric province code into its label via a named lookup
# vector. Codes absent from the lookup (and NA) yield NA from the lookup and
# are then filled with "NOT STATED".
cchs_clean <- cchs_clean %>%
  mutate(
    geogprv_name = coalesce(
      unname(
        c(
          "10" = "NFLD & LAB",
          "11" = "PEI",
          "12" = "NOVA SCOTIA",
          "13" = "NEW BRUNSWICK",
          "24" = "QUEBEC",
          "35" = "ONTARIO",
          "46" = "MANITOBA",
          "47" = "SASKATCHEWAN",
          "48" = "ALBERTA",
          "59" = "BRITISH COLUMBIA",
          "60" = "YUKON/NWT/NUNA",
          "96" = "NOT APPLICABLE",
          "97" = "DON'T KNOW",
          "98" = "REFUSAL"
        )[as.character(geogprv)]
      ),
      "NOT STATED"
    )
  )

Compute the mean and standard deviation of height and weight

# Overall mean and standard deviation of height and weight. NAs (the recoded
# missing values) are dropped; TRUE is spelled out because T can be reassigned.
cchs_clean %>%
  summarize(
    avg_ht = mean(hwtghtm, na.rm = TRUE),
    sd_ht = sd(hwtghtm, na.rm = TRUE),
    avg_wt = mean(hwtgwtk, na.rm = TRUE),
    sd_wt = sd(hwtgwtk, na.rm = TRUE)
  )

Compute the mean and standard deviation of height and weight by BMI category and by province

# Mean and standard deviation of height and weight within each BMI category.
# Groups by bmi_category, the column derived from hwtgbmi earlier in this
# file; NAs are dropped from each statistic.
cchs_clean %>%
  group_by(bmi_category) %>%
  summarize(
    avg_ht = mean(hwtghtm, na.rm = TRUE),
    sd_ht = sd(hwtghtm, na.rm = TRUE),
    avg_wt = mean(hwtgwtk, na.rm = TRUE),
    sd_wt = sd(hwtgwtk, na.rm = TRUE)
  )
# Mean and standard deviation of height and weight within each province
# (geogprv_name); NAs are dropped from each statistic.
cchs_clean %>%
  group_by(geogprv_name) %>%
  summarize(
    avg_ht = mean(hwtghtm, na.rm = TRUE),
    sd_ht = sd(hwtghtm, na.rm = TRUE),
    avg_wt = mean(hwtgwtk, na.rm = TRUE),
    sd_wt = sd(hwtgwtk, na.rm = TRUE)
  )

Accel Data

Read in the 2 data files accel.csv accel = read_csv("onboard/accel_data.csv")
Create a new variable that indicates participant 1 and participant 2
Append (stack) the 2 files together
Quickly view the first 10 rows of data
Display the type of variable for all variables in the dataset
Create a scatterplot on x_axis and y_axis
Compute the mean and standard deviation of height and weight
Convert the time data to time format
Compute the sum of each axis by second and by participant
Compute the gravity subtracted vector magnitude sqrt(x^2 + y^2 + z^2) - 1 on the new data for each participant

Database

Open the connection to the interact_demo database, then use tbl() to get a reference to each of the tables in it.

# Open a PostgreSQL connection to interact_demo. The user name and password
# are prompted for through rstudioapi rather than written into the script.
# NOTE(review): dbDriver(), dbConnect() and tbl() must already be attached;
# no library() call for their packages is visible in this file.
drv <- dbDriver("PostgreSQL")
con <- dbConnect(
  drv,
  dbname = "interact_demo",
  host = "yakitori.usask.ca",
  port = 5432,
  user = rstudioapi::askForPassword("Database user"),
  password = rstudioapi::askForPassword("Database password")
)

# References to the two tables in the database.
demo_accel <- con %>% tbl("accel_data")
demo_cchs <- con %>% tbl("cchs")

View the first ten rows of each table.

# First 10 rows of each table.
head(demo_accel, n = 10)
head(demo_cchs, n = 10)

Display rows from the cchs table where the data originates in Saskatchewan.

# Province code 47 is Saskatchewan.
filter(demo_cchs, geogprv == 47)

Display only the caseid column from the cchs table, where the data originates from Ontario

# Province code 35 is Ontario; keep only the caseid column.
demo_cchs %>%
  filter(geogprv == 35) %>%
  select(caseid)

Display the average BMI by province from the cchs table

# Average BMI per province code, keeping only rows with hwtgbmi below 50.
# na.rm = TRUE is explicit: SQL aggregates always ignore missing values, and
# dbplyr warns when the argument is left out.
demo_cchs %>%
  filter(hwtgbmi < 50) %>%
  group_by(geogprv) %>%
  summarise(avg_bmi = mean(hwtgbmi, na.rm = TRUE))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

CCHS Data:

Accel Data

Database

FilesExpand file tree

exercise_solutions.md

Latest commit

History

exercise_solutions.md

File metadata and controls

CCHS Data:

Accel Data

Database