This document summarizes the analyses conducted on the Play & Learning Across a Year (PLAY) project pilot data. The data are stored and shared at https://nyu.databrary.org/volume/444.
# Set the default chunk option `echo = TRUE` for the whole report.
knitr::opts_chunk$set(echo = TRUE)
# Load databraryapi package from github.com/PLAY-behaviorome/databraryapi
if (!require(databraryapi)) {
devtools::install_github("PLAY-behaviorome/databraryapi")
library(databraryapi)
}
## Loading required package: databraryapi
## Welcome to the databraryapi package
# Load other libraries
library(tidyverse)
## ── Attaching packages ─────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## ── Conflicts ────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
## Variables for this particular volume and session.
# Databrary volume ID passed as `vol_id` to the download and listing calls.
play_vol_id <- 444
# Value of `session_name` identifying the session that stores the parent
# report (survey) files.
survey_data_session_name <- "Parent report data"
The Databrary volume is 444.
We need to log in to Databrary to access these data. If you have logged in before from the machine on which you are running this report, you will be prompted only for your Databrary ID (email address). If you have not logged in before, you will be prompted for both your Databrary ID (email address) and your Databrary password.
# Log in to Databrary with the account held in `params$db_account`
# (presumably an R Markdown report parameter -- confirm in the YAML header).
databraryapi::login_db(params$db_account)
## Login successful.
## [1] TRUE
There are several data sources we need to gather and clean. Participant demographic data are stored in the Databrary spreadsheet. Parent report (survey) data are stored in the session with session_name == “Parent report data”. That session name is stored in the variable `survey_data_session_name` for later use.
We’ll first gather the participant demographic data stored in the Databrary spreadsheet.
# Download the volume-level spreadsheet for the PLAY volume.
play_demog_raw <- databraryapi::download_session_csv(vol_id = play_vol_id)

# Everything below depends on this table, so stop the report here when the
# download returned nothing. `call. = FALSE` keeps the message free of an
# irrelevant call prefix in the knitted output.
if (is.null(play_demog_raw)) {
  stop("No data downloaded from Databrary. Are you logged in?", call. = FALSE)
}
# Build the cleaned demographics table from the raw spreadsheet:
#   1. keep the columns of interest, renaming the dotted spreadsheet names to
#      underscore names as they are selected;
#   2. keep only rows whose release level is one of the three listed values.
play_demog <- play_demog_raw %>%
  select(
    session_id,
    session_release,
    participant_gender = participant.gender,
    participant_race = participant.race,
    participant_ethnicity = participant.ethnicity,
    participant_gestational_age = participant.gestational.age,
    participant_birth_weight = participant.birth.weight,
    participant_disability = participant.disability,
    participant_language = participant.language,
    group_name = group.name,
    context_language = context.language
  ) %>%
  filter(session_release %in% c('EXCERPTS', 'PRIVATE', 'SHARED'))

# Show the resulting column names and types.
str(play_demog)
## 'data.frame': 20 obs. of 11 variables:
## $ session_id : int 18801 18803 18805 18806 18807 18808 18810 18811 18813 18814 ...
## $ session_release : Factor w/ 4 levels "","EXCERPTS",..: 2 2 3 2 2 2 2 2 2 2 ...
## $ participant_gender : Factor w/ 3 levels "","Female","Male": 3 3 2 3 3 2 2 2 3 3 ...
## $ participant_race : Factor w/ 5 levels "","Asian","More than one",..: 4 5 3 5 5 5 5 5 5 5 ...
## $ participant_ethnicity : Factor w/ 4 levels "","Hispanic or Latino",..: 4 3 2 3 3 3 3 2 3 3 ...
## $ participant_gestational_age: num NA 40 39.5 40 40 38 40 40 41 41 ...
## $ participant_birth_weight : num NA 8.25 8 7 8.38 ...
## $ participant_disability : Factor w/ 4 levels "","Acid Reflux",..: 4 3 3 3 3 3 3 3 3 3 ...
## $ participant_language : Factor w/ 9 levels "","English","English, German",..: 9 6 9 2 2 2 7 4 3 2 ...
## $ group_name : Factor w/ 4 levels "","12 mos","18 mos",..: 3 3 3 3 4 3 3 2 3 3 ...
## $ context_language : Factor w/ 3 levels "","English","English, Spanish": 2 2 2 2 2 2 2 2 2 2 ...
First, we need to list the data files available in the session where the parent report data are stored.
# ID of the session whose name matches `survey_data_session_name`.
# `pull()` extracts the column as a plain vector, so the numeric conversion
# acts on the IDs themselves rather than on a data frame.
parent_rpt_session_id <- play_demog_raw %>%
  filter(session_name == survey_data_session_name) %>%
  pull(session_id) %>%
  as.numeric()

# Later calls pass this as a single `session_id`; fail early and clearly if the
# name matched no session or more than one.
if (length(parent_rpt_session_id) != 1) {
  stop("Expected exactly one session named '", survey_data_session_name,
       "' but found ", length(parent_rpt_session_id), ".",
       call. = FALSE)
}
Now, we list the data files in that session.
# List every asset in the parent report session ...
surveys <- list_assets_in_session(
  vol_id = play_vol_id,
  session_id = parent_rpt_session_id
)
# ... and keep only each asset's ID and name.
surveys <- surveys %>%
  select(asset_id, name)

# Render the listing as a table in the report.
knitr::kable(surveys)
| asset_id | name |
|---|---|
| 159177 | childcare |
| 117092 | locomotion |
| 116791 | child-birth |
| 116790 | family |
| 116789 | sleep |
| 116787 | language-exposure |
Let’s download each of these separately.
# Read one parent report file from the survey session, selected by its name in
# `surveys` rather than by its row position, so a change in the order of the
# asset listing cannot load the wrong file into the wrong variable.
#
# survey_name: value of `surveys$name` identifying the file.
# Returns whatever `read_csv_data_as_df()` returns for that asset.
# Stops if the name does not match exactly one asset.
read_survey <- function(survey_name) {
  asset_id <- surveys$asset_id[surveys$name == survey_name]
  if (length(asset_id) != 1) {
    stop("Expected exactly one asset named '", survey_name,
         "' but found ", length(asset_id), ".",
         call. = FALSE)
  }
  read_csv_data_as_df(session_id = parent_rpt_session_id,
                      asset_id = asset_id)
}

childcare_df <- read_survey("childcare")
## No encoding supplied: defaulting to UTF-8.
loco_df <- read_survey("locomotion")
## No encoding supplied: defaulting to UTF-8.
# NOTE(review): the rendered output for this file shows "EOF within quoted
# string", and str() later reports 11 rows here versus 17 for the other
# one-row-per-child surveys. The file may have been read incompletely -- verify.
childbirth_df <- read_survey("child-birth")
## No encoding supplied: defaulting to UTF-8.
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
family_df <- read_survey("family")
## No encoding supplied: defaulting to UTF-8.
sleep_df <- read_survey("sleep")
## No encoding supplied: defaulting to UTF-8.
lang_exp_df <- read_survey("language-exposure")
## No encoding supplied: defaulting to UTF-8.
# Show the columns and types of the locomotion survey.
str(loco_df)
## 'data.frame': 17 obs. of 7 variables:
## $ id : int 2 3 5 6 7 9 10 11 12 13 ...
## $ hkcrawl_onset_date: chr "8/13/15" "7/31/15" "5/2/15" "3/10/16" ...
## $ hkcrawl_onset_mos : num 7.53 7.03 8.48 12.59 7.2 ...
## $ walk_onset_date : chr "12/5/15" "11/30/15" "8/4/15" "7/9/16" ...
## $ walk_onset_mos : num 11.3 11.1 11.6 16.6 11.7 ...
## $ walk_onset_src : chr "phone" NA "phone" "babybook" ...
## $ Interview.Comments: chr "" "" "" "" ...
# Show the columns and types of the child-birth survey.
str(childbirth_df)
## 'data.frame': 11 obs. of 11 variables:
## $ id : int 2 3 5 6 7 9 10 11 12 13 ...
## $ bdate_child : chr "12/27/14" "12/29/14" "8/17/14" "2/21/15" ...
## $ born_duedate : chr "yes" "yes" "yes" "yes" ...
## $ child_age : num 18.3 18.3 24.3 18.2 17.8 ...
## $ agegroup : int 18 18 24 18 18 18 18 18 24 12 ...
## $ birth_weight_pounds : int 8 8 8 8 7 6 8 6 6 8 ...
## $ birth_weight_ounces : num 14 0.5 6 2 4 9 3 12 0 9 ...
## $ newborn_complications_yn : chr "no" "no" "no" "no" ...
## $ newborn_complications : chr NA NA NA NA ...
## $ illness_diagnosis_yn : chr "no" "no" "no" "no" ...
## $ illness_diagnosis_details: chr NA NA NA NA ...
# Show the columns and types of the family survey.
str(family_df)
## 'data.frame': 17 obs. of 21 variables:
## $ id : int 2 3 5 6 7 9 10 11 12 13 ...
## $ mom_bdate : chr "12/23/75" "1/9/81" "6/30/69" "1/23/82" ...
## $ mom_race : chr "5" "7" "5" "5" ...
## $ mom_ethnicity : chr "no" "yes" "no" "no" ...
## $ caregiver_country : int 4 4 1 1 1 4 1 1 1 1 ...
## $ mom_education : int 22 19 23 19 22 21 21 17 22 22 ...
## $ mom_working : chr "ft" "ft" "pt" "ft" ...
## $ mom_occupation : chr "Assistant professor" NA "Lawyer" "Editor for academic journal" ...
## $ mom_training : chr "no" "no" "no" "yes" ...
## $ caregiver_us_entry: chr "1/1/87" "1/1/89" NA NA ...
## $ partner_cohabitate: chr "yes" "yes" "yes" "yes" ...
## $ partner_race : chr "5" "7" "5" "5" ...
## $ partner_ethnicity : chr "no" "yes" "no" "no" ...
## $ partner_education : int 21 21 21 19 22 NA 21 17 23 21 ...
## $ partner_working : chr "ft" "ft" "ft" "ft" ...
## $ partner_training : chr "no" "yes" "no" "no" ...
## $ partner_occupation: chr "Finance, Asset Manager" NA "Manager" "Sales manager" ...
## $ partner_dob : chr NA "9/14/82" "3/29/80" "10/24/85" ...
## $ family_sib_dob : chr NA NA NA NA ...
## $ brothers : int NA NA NA NA NA NA NA NA 0 0 ...
## $ siblings : int NA NA NA NA NA NA NA NA 0 0 ...
# Show the columns and types of the sleep survey.
str(sleep_df)
## 'data.frame': 17 obs. of 5 variables:
## $ id : int 2 3 5 6 7 9 10 11 12 13 ...
## $ sleep.time : chr "20:30:00" "20:45:00" "19:30:00" "19:45:00" ...
## $ wake.time : chr "07:00:00" "07:20:00" "06:30:00" "07:15:00" ...
## $ naps.hours : num 2 2 2.5 2 2 2 2.5 1.5 2 2.5 ...
## $ sleeping.loc: chr "crib_separate" "crib_separate" "toddlerbed" "crib_separate" ...
# Show the columns and types of the childcare survey.
str(childcare_df)
## 'data.frame': 17 obs. of 5 variables:
## $ id : int 2 3 5 6 7 9 10 11 12 13 ...
## $ childcare_alt : chr "person" "center" "both" "center" ...
## $ childcare_hours : int 60 40 52 50 NA 50 29 21 35 20 ...
## $ childcare_other_children: int 1 11 15 9 NA 0 11 3 9 0 ...
## $ childcare_init_age : num 0 8 12 3 NA 4 1.5 14 1 6 ...
# Show the columns and types of the language-exposure survey.
str(lang_exp_df)
## 'data.frame': 66 obs. of 3 variables:
## $ id : int 2 2 3 5 6 9 10 11 11 12 ...
## $ language : chr "Russian" "Armenian" "English" "English" ...
## $ exposure_context: chr "childcare" "childcare" "childcare" "childcare" ...
Let’s set some plot thematic elements.
# Manual fill scale with the four PLAY colours.
play.palette <- scale_fill_manual(
  values = c("blue2", "firebrick2", "chartreuse2", "darkorchid2")
)

# Shared theme for the plots below: theme_classic() with the legend at the
# bottom and untitled, enlarged text, no axis lines, and no y-axis title.
play.theme <- theme_classic() +
  theme(
    axis.line = element_blank(),
    axis.title = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = rel(1.2)),
    axis.text.y = element_text(size = rel(1.2)),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = rel(1.2))
  )
# Scatter plot of birth weight against gestational age, coloured by race.
ggplot(
  play_demog,
  aes(
    x = participant_gestational_age,
    y = participant_birth_weight,
    color = participant_race
  )
) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
These are some illustrative plots.
We have to convert the sleep.time and wake.time variables so that they can be plotted. It looks like the lubridate package is a good option.
# sleep <- lubridate::ymd_hms("2001-01-01 20:30:00")
# awake <- lubridate::ymd_hms("2001-01-02 07:00:00")
# (night_hrs <- awake-sleep)
# Helper functions for conversion
# Turn a bedtime clock string into a date-time on the fixed reference date
# 2001-01-01.
#
# hhmmss: clock-time string(s) such as "20:30:00".
# Returns the result of lubridate::ymd_hms() on "2001-01-01 <hhmmss>".
sleep_time <- function(hhmmss) {
  stamp <- paste("2001-01-01", hhmmss)
  lubridate::ymd_hms(stamp)
}
# Turn a wake-up clock string into a date-time on 2001-01-02, i.e. one day
# after the reference date used for bedtimes.
#
# hhmmss: clock-time string(s) such as "07:00:00".
# Returns the result of lubridate::ymd_hms() on "2001-01-02 <hhmmss>".
wake_time <- function(hhmmss) {
  stamp <- paste("2001-01-02", hhmmss)
  lubridate::ymd_hms(stamp)
}
# Time asleep between a bedtime and the following wake-up time.
#
# sleep_tm: bedtime clock string(s), e.g. "20:30:00".
# awake_tm: wake-up clock string(s), e.g. "07:00:00".
# Returns a difftime that is always expressed in hours.
time_asleep <- function(sleep_tm, awake_tm) {
  # Request hours explicitly; plain subtraction lets R pick the unit.
  elapsed_hrs <- as.numeric(
    difftime(wake_time(awake_tm), sleep_time(sleep_tm), units = "hours")
  )
  # wake_time() always lands on the day after sleep_time(), so a bedtime after
  # midnight would be counted as more than 24 hours; remove the extra day.
  elapsed_hrs <- elapsed_hrs - 24 * (elapsed_hrs > 24)
  as.difftime(elapsed_hrs, units = "hours")
}
# Add date-time versions of the bedtime and wake-up columns and the length of
# the night's sleep.
sleep_df <- sleep_df %>%
  mutate(
    sleep_tm = sleep_time(sleep.time),
    wake_tm = wake_time(wake.time),
    # Store plain numeric hours: the unit is requested explicitly instead of
    # being auto-selected, and ggplot2 can pick a scale for a numeric column
    # without the difftime message.
    nightsleep_hrs = as.numeric(difftime(wake_tm, sleep_tm, units = "hours"))
  )
Now, we can plot.
# Histogram (10 bins) of night sleep duration.
ggplot(sleep_df, aes(x = nightsleep_hrs)) +
  geom_histogram(bins = 10) +
  play.theme
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
# Histogram (30 bins) of childcare hours, filled by childcare type.
ggplot(childcare_df, aes(x = childcare_hours, fill = childcare_alt)) +
  geom_histogram(bins = 30) +
  play.theme
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Count of reports per language, one panel row per exposure context.
# `language` is discrete, so this is a bar chart of counts: geom_bar() uses
# stat "count" directly, whereas geom_histogram(stat = "count") passes binning
# parameters the count stat does not accept and triggers a warning.
lang_exp_df %>%
  ggplot(.) +
  aes(x = language) +
  facet_grid(exposure_context ~ .) +
  geom_bar() +
  play.theme
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Crawl and walk onset ages per participant: one point per milestone, joined
# by a line within each participant, with a rug of the ages along the bottom.
# NOTE(review): gather() is kept because the attached tidyr is 0.8.2;
# pivot_longer() is the newer replacement if tidyr is upgraded.
loco_df %>%
  gather(key = milestone, value = age.mos,
         hkcrawl_onset_mos, walk_onset_mos) %>%
  # Spell out the levels so each label is tied to its column by name rather
  # than by the alphabetical order of the column names.
  mutate(milestone = factor(milestone,
                            levels = c("hkcrawl_onset_mos", "walk_onset_mos"),
                            labels = c("crawl", "walk"))) %>%
  ggplot() +
  aes(x = age.mos, y = id) +
  geom_point(aes(shape = milestone, color = milestone)) +
  geom_line(aes(group = id)) +
  # x is inherited from the plot-level mapping.
  geom_rug(aes(group = milestone, color = milestone),
           sides = "b") +
  xlab("Age (months)") +
  ylab("Participant ID") +
  play.theme +
  # play.theme blanks the y-axis title, which would hide the ylab() above;
  # restore it for this plot only.
  theme(axis.title.y = element_text(angle = 90))
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_path).