Tidyverse学习笔记

Tidyverse

作者

Qingyao Zhang

发布于

2026年4月14日

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
# 使用Keng中的depress数据进行演示
data("depress", package = "Keng")

1 %>% versus |>

对于R语言4.1之前的版本,tidyverse依赖magrittr中的管道操作符%>%。对于R语言4.1及以后的版本,R base中包含自己的管道运算符|>。然而,%>%与|>的功能不同。

1.1 |>的用法

R base中的|>的功能比较简单,运行?'|>'查看|>的文档。|>的用法有以下几种:

  1. 常规用法,将|>左侧对象传递给右侧函数的第一个参数。
# simple uses:
# same as pull(depress, gender)
depress |> 
  pull(gender) 
##   [1]  1  2  2  1  2  1  1  2  2  2  1  1  2  2  1  2  1  1  2  1  2  1  2  1  2
##  [26]  2  1  2  1  2  1  1  2  1  2  2  1  2  1  2  1  1  1  2  2  1  2  2  2  1
##  [51] NA NA  1  1  2  1  1  2  2  1  1  1  2  1  1  2  2  1  1  1  2  2 NA  1  1
##  [76]  2  2  2  1  2  1  1  2  1  1  1  1  1  2  1  2 NA  1  1  2  1  1  2  1  1
## [101]  2  1  2  1  2  2  1  2  1  1  2  1  2  1  1  2  2  2  1  2  1  2  1  1  2
## [126]  1  2  1  2  1  1  2  2  2  2  1  1  1  1  2  2  1  1  2  1  1  1  2  1  2
## [151]  2  1  1  2  2  1  1  2  2  1  1  2  2  1  1  1  2  1  2  1  1  2  1  1  1
## [176]  1  2  2  1  1  1 NA NA  2  1
# same as nrow(subset(depress, gender == 0))
depress |> 
  subset(gender == 0) |> 
  nrow()  
## [1] 0
  1. |>左侧对象传递给右侧函数的其他参数。
  1. 使用_ 占位符指代|>左侧对象
注记:_占位符的使用规则
  1. _默认内隐地传递给|>右侧函数的第一个参数;当_以外显的方式传递时,_的默认传递将被取消
  2. 只能使用参数名传递_
  3. 只能使用一个_
  4. 不能在嵌套函数中使用_。在嵌套函数中需使用%>%(由magrittr提供,dplyr亦导出该运算符)或匿名函数
# valid use
depress |> 
  subset(class == 3) |> 
  t.test(attach_anx ~ gender, data = _)
## 
##  Welch Two Sample t-test
## 
## data:  attach_anx by gender
## t = 0.27149, df = 48.987, p-value = 0.7872
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.6520605  0.8557642
## sample estimates:
## mean in group 1 mean in group 2 
##        2.296296        2.194444

下面的用法不可行:

# Invalid: `_` sits inside the nested call select(), not in a named argument
# of summary() itself, so this pipeline is rejected.
depress |> 
  summary(select(.data = _, gender))
  1. 使用匿名函数
depress |> 
  subset(class == 1) |> 
  (function(d) t.test(depr1 ~ gender, data = d))()
## 
##  Welch Two Sample t-test
## 
## data:  depr1 by gender
## t = -2.379, df = 41.406, p-value = 0.02205
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.44829941 -0.03670396
## sample estimates:
## mean in group 1 mean in group 2 
##        1.839583        2.082085
depress |> 
  subset(class == 2) |> 
  (\(d) t.test(depr1 ~ gender, data = d))()
## 
##  Welch Two Sample t-test
## 
## data:  depr1 by gender
## t = -0.75249, df = 29.638, p-value = 0.4577
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.3367131  0.1554611
## sample estimates:
## mean in group 1 mean in group 2 
##        2.027231        2.117857
  1. |>右侧函数使用参数名传递参数值,从而跳过目标参数之前的参数
# Naming the arguments that precede the target one (here `formula`) leaves
# `data` as the first free formal argument, so the piped data frame is matched
# to it. This needs a function whose own signature lists those arguments,
# e.g. lm(formula, data, ...).
# It does not work with t.test(): that is an S3 generic dispatching on its
# first argument, so the piped data frame becomes `x` and the default
# (one-sample) method is run on the whole data frame.
# Row conditions are joined with `&`; the second argument of subset() is
# `select` (columns), not an additional row filter.
depress |>
  subset(class == 3 & !is.na(gender)) |>
  lm(formula = depr1 ~ gender)
## (output omitted here; re-render the document to regenerate it)
  1. |>结合使用_占位符提取数据子集(需R 4.3及以上版本)
depress |> 
  _$gender
##   [1]  1  2  2  1  2  1  1  2  2  2  1  1  2  2  1  2  1  1  2  1  2  1  2  1  2
##  [26]  2  1  2  1  2  1  1  2  1  2  2  1  2  1  2  1  1  1  2  2  1  2  2  2  1
##  [51] NA NA  1  1  2  1  1  2  2  1  1  1  2  1  1  2  2  1  1  1  2  2 NA  1  1
##  [76]  2  2  2  1  2  1  1  2  1  1  1  1  1  2  1  2 NA  1  1  2  1  1  2  1  1
## [101]  2  1  2  1  2  2  1  2  1  1  2  1  2  1  1  2  2  2  1  2  1  2  1  1  2
## [126]  1  2  1  2  1  1  2  2  2  2  1  1  1  1  2  2  1  1  2  1  1  1  2  1  2
## [151]  2  1  1  2  2  1  1  2  2  1  1  2  2  1  1  1  2  1  2  1  1  2  1  1  1
## [176]  1  2  2  1  1  1 NA NA  2  1
depress |> 
  _[["gender"]]
##   [1]  1  2  2  1  2  1  1  2  2  2  1  1  2  2  1  2  1  1  2  1  2  1  2  1  2
##  [26]  2  1  2  1  2  1  1  2  1  2  2  1  2  1  2  1  1  1  2  2  1  2  2  2  1
##  [51] NA NA  1  1  2  1  1  2  2  1  1  1  2  1  1  2  2  1  1  1  2  2 NA  1  1
##  [76]  2  2  2  1  2  1  1  2  1  1  1  1  1  2  1  2 NA  1  1  2  1  1  2  1  1
## [101]  2  1  2  1  2  2  1  2  1  1  2  1  2  1  1  2  2  2  1  2  1  2  1  1  2
## [126]  1  2  1  2  1  1  2  2  2  2  1  1  1  1  2  2  1  1  2  1  1  1  2  1  2
## [151]  2  1  1  2  2  1  1  2  2  1  1  2  2  1  1  1  2  1  2  1  1  2  1  1  1
## [176]  1  2  2  1  1  1 NA NA  2  1
  1. 使用|>转换代码
# quote() returns the parsed expression without evaluating it: the `|>`
# pipeline has already been rewritten into ordinary nested calls.
quote(depress |> subset(class == 3) |> nrow())
## nrow(subset(depress, class == 3))

1.2 %>%的用法

  1. 常规用法,将%>%左侧对象传递给右侧函数的第一个参数。
# 同pull(depress, gender)
depress %>% 
  pull(gender) 
##   [1]  1  2  2  1  2  1  1  2  2  2  1  1  2  2  1  2  1  1  2  1  2  1  2  1  2
##  [26]  2  1  2  1  2  1  1  2  1  2  2  1  2  1  2  1  1  1  2  2  1  2  2  2  1
##  [51] NA NA  1  1  2  1  1  2  2  1  1  1  2  1  1  2  2  1  1  1  2  2 NA  1  1
##  [76]  2  2  2  1  2  1  1  2  1  1  1  1  1  2  1  2 NA  1  1  2  1  1  2  1  1
## [101]  2  1  2  1  2  2  1  2  1  1  2  1  2  1  1  2  2  2  1  2  1  2  1  1  2
## [126]  1  2  1  2  1  1  2  2  2  2  1  1  1  1  2  2  1  1  2  1  1  1  2  1  2
## [151]  2  1  1  2  2  1  1  2  2  1  1  2  2  1  1  1  2  1  2  1  1  2  1  1  1
## [176]  1  2  2  1  1  1 NA NA  2  1
# 同nrow(subset(depress, gender == 0))
depress %>% 
  subset(gender == 0) %>% 
  nrow()  
## [1] 0
  1. %>%左侧对象传递给右侧函数的其他参数。
  1. 使用. 占位符指代%>%左侧对象
注记:.占位符的使用规则
  1. 当.仅出现在嵌套函数调用中(或未出现)时,%>%左侧对象仍会以内隐的方式传递给右侧函数的第一个参数;当.直接作为右侧函数的参数时,不再额外传递给第一个参数。内隐传递也可以用{}取消
  2. 可以使用位置传递.
  3. %>%右侧函数可以使用多个.
  4. 可以在嵌套函数中使用.
# 有效用法
depress %>%
  mutate(.,
         depr1_mean = rowMeans(select(., starts_with("depr1")))) %>%
  pull(depr1_mean)
##   [1] 1.75 2.30 1.50 1.80 2.15 1.65 2.25 2.45 1.80 1.50 1.95 2.10 2.45   NA 1.50
##  [16] 1.85 1.80 1.45 2.30 1.30 1.40 1.90 2.70 1.70 2.65 2.55 1.45 2.25 2.15 2.10
##  [31] 2.00 2.25   NA 2.20 2.45 2.40 2.05 1.65 1.60 1.35 1.75 1.85 2.00 1.50 2.20
##  [46] 1.75 2.20 1.35 2.40 1.95   NA   NA 1.70 1.20 1.65 2.15 2.20 2.35 2.05 1.95
##  [61] 1.85 2.15 2.15 1.60 2.05 2.25 1.55 1.75   NA 1.65 2.05   NA   NA 1.80 2.85
##  [76] 2.95 2.40 1.85 2.00 2.05 2.00 2.30 2.05 2.50 1.65 2.25 2.00 2.60 2.30   NA
##  [91] 2.00   NA 1.90   NA 1.55 1.65 1.65 2.00 2.20 1.55 1.55 2.90   NA 1.90 2.10
## [106] 1.35 1.50 2.35 1.75 1.30 1.95 1.60 1.45 2.70 1.75 2.10 1.80 1.90 1.75 1.60
## [121]   NA 1.85 1.65 1.90 1.80 1.50 1.55 1.80 1.70 1.65 1.85 1.35 1.90 2.45 1.50
## [136] 2.10   NA 2.15 2.95 1.25   NA 1.50 1.35 1.55 1.60   NA 1.15 1.95 1.90 1.65
## [151] 1.65 1.60 1.90 1.80 2.05 2.05 1.50 2.20 1.55 2.05 2.30   NA 2.45 1.35 2.25
## [166]   NA 2.00 2.15 1.45 2.75 2.35 1.80 1.65   NA 1.95 1.90 1.75 2.10 1.85 1.95
## [181] 1.70   NA   NA 1.45 1.25
# `.` appears only inside the nested call length(.), so the left-hand side is
# still inserted as the first argument of c(): the call is c(x, length(x)),
# which appends the length (5) to the five values.
depress %>%
  slice_head(n = 5) %>%
  pull(depr1) %>%
  c(length(.))
## [1] 1.75 2.30 1.50 1.80 2.15 5.00
# Inside `{}` the left-hand side is not inserted as the first argument, so
# only length(.) is passed to c().
depress %>%
  slice_head(n = 5) %>%
  pull(depr1) %>%
  {c(length(.))}
## [1] 5
  1. 使用匿名函数
depress %>%
  subset(class == 1) %>% 
  (function(d) t.test(depr1 ~ gender, data = d))
## 
##  Welch Two Sample t-test
## 
## data:  depr1 by gender
## t = -2.379, df = 41.406, p-value = 0.02205
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.44829941 -0.03670396
## sample estimates:
## mean in group 1 mean in group 2 
##        1.839583        2.082085
depress %>%
  subset(class == 2) %>% 
  t.test(depr1 ~ gender, data = .)
## 
##  Welch Two Sample t-test
## 
## data:  depr1 by gender
## t = -0.75249, df = 29.638, p-value = 0.4577
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.3367131  0.1554611
## sample estimates:
## mean in group 1 mean in group 2 
##        2.027231        2.117857
# When the right-hand side of `%>%` needs several statements, wrap them in
# `{}`; inside `{}` the left-hand side is not inserted as the first argument
# and is only reachable through `.`.
# NOTE: a `{}` block returns the value of its last expression, so only
# nrow(.) is printed below; ncol(.) is evaluated but its result is discarded.
depress %>%
  subset(class == 3) %>% 
  {
    ncol(.)
    nrow(.)
  }
## [1] 53
  1. %>%结合使用.占位符提取数据子集
depress %>% 
  .$gender
##   [1]  1  2  2  1  2  1  1  2  2  2  1  1  2  2  1  2  1  1  2  1  2  1  2  1  2
##  [26]  2  1  2  1  2  1  1  2  1  2  2  1  2  1  2  1  1  1  2  2  1  2  2  2  1
##  [51] NA NA  1  1  2  1  1  2  2  1  1  1  2  1  1  2  2  1  1  1  2  2 NA  1  1
##  [76]  2  2  2  1  2  1  1  2  1  1  1  1  1  2  1  2 NA  1  1  2  1  1  2  1  1
## [101]  2  1  2  1  2  2  1  2  1  1  2  1  2  1  1  2  2  2  1  2  1  2  1  1  2
## [126]  1  2  1  2  1  1  2  2  2  2  1  1  1  1  2  2  1  1  2  1  1  1  2  1  2
## [151]  2  1  1  2  2  1  1  2  2  1  1  2  2  1  1  1  2  1  2  1  1  2  1  1  1
## [176]  1  2  2  1  1  1 NA NA  2  1
depress %>% 
  .[["gender"]]
##   [1]  1  2  2  1  2  1  1  2  2  2  1  1  2  2  1  2  1  1  2  1  2  1  2  1  2
##  [26]  2  1  2  1  2  1  1  2  1  2  2  1  2  1  2  1  1  1  2  2  1  2  2  2  1
##  [51] NA NA  1  1  2  1  1  2  2  1  1  1  2  1  1  2  2  1  1  1  2  2 NA  1  1
##  [76]  2  2  2  1  2  1  1  2  1  1  1  1  1  2  1  2 NA  1  1  2  1  1  2  1  1
## [101]  2  1  2  1  2  2  1  2  1  1  2  1  2  1  1  2  2  2  1  2  1  2  1  1  2
## [126]  1  2  1  2  1  1  2  2  2  2  1  1  1  1  2  2  1  1  2  1  1  1  2  1  2
## [151]  2  1  1  2  2  1  1  2  2  1  1  2  2  1  1  1  2  1  2  1  1  2  1  1  1
## [176]  1  2  2  1  1  1 NA NA  2  1
  1. 使用%>%和.创建函数
# quote() returns the `%>%` pipeline exactly as written; it is not rewritten
# into nested calls.
quote(depress %>% subset(class == 3) %>% nrow())
## depress %>% subset(class == 3) %>% nrow()
# A pipeline whose left-hand side is `.` is not evaluated immediately; it
# creates a function that applies the remaining steps to its argument.
subset_nrow <- . %>% subset(class == 3) %>% nrow()
subset_nrow(depress)
## [1] 53

2 c_across()

dplyr文档中介绍到,与c()相比,(1)c_across()使用tidy select,更加便捷; (2)c_across()使用vctrs::vec_c(),给出的输出更加安全。

tidy select只有15种(见help("select")):`:`、`!`、`&`、`c()`、`everything()`、`last_col()`、`group_cols()`、`starts_with()`、`ends_with()`、`contains()`、`matches()`、`num_range()`、`all_of()`、`any_of()`、`where()`。

下面的例子在data.frame中对c()和c_across()进行了比较,代码中注释了结果正确与否。

set.seed(20250927)
toy_dat <- data.frame(x1 = rnorm(10), x2 = rnorm(10), x3 = rnorm(10))
head(toy_dat)
##           x1         x2         x3
## 1 -2.7734224 -0.6661558  1.0882648
## 2 -1.1618233 -0.9076756  0.1483481
## 3 -0.3773254  0.1182323  1.0507643
## 4  0.2743375 -0.1521034  0.7412800
## 5  0.6070295  0.7295347 -0.5049492
## 6 -0.3429219 -0.1650072  2.3721901
# Row-wise mean of x1, x2 and x3 computed four ways.
toy_dat |>
  rowwise() |> 
  mutate(
    # correct: the three columns are listed explicitly
    y1 = mean(c(x1, x2, x3)),
    # incorrect: here `:` is the base sequence operator, so this averages the
    # sequence running from x1 towards x3 in steps of 1, not the three columns
    y2 = mean(x1:x3),
    # incorrect: wrapping the same sequence in c() changes nothing
    y3 = mean(c(x1:x3)),
    # correct: c_across() reads x1:x3 as a tidy-select column range
    y4 = mean(c_across(x1:x3))
  ) |> 
  head()
## # A tibble: 6 × 7
## # Rowwise: 
##       x1     x2     x3     y1     y2     y3     y4
##    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 -2.77  -0.666  1.09  -0.784 -1.27  -1.27  -0.784
## 2 -1.16  -0.908  0.148 -0.640 -0.662 -0.662 -0.640
## 3 -0.377  0.118  1.05   0.264  0.123  0.123  0.264
## 4  0.274 -0.152  0.741  0.288  0.274  0.274  0.288
## 5  0.607  0.730 -0.505  0.277  0.107  0.107  0.277
## 6 -0.343 -0.165  2.37   0.621  0.657  0.657  0.621

下面的例子在tibble中对c()和c_across()进行了比较,代码中注释了结果正确与否。

toy_dat <- as_tibble(toy_dat)
head(toy_dat)
## # A tibble: 6 × 3
##       x1     x2     x3
##    <dbl>  <dbl>  <dbl>
## 1 -2.77  -0.666  1.09 
## 2 -1.16  -0.908  0.148
## 3 -0.377  0.118  1.05 
## 4  0.274 -0.152  0.741
## 5  0.607  0.730 -0.505
## 6 -0.343 -0.165  2.37
# Row-wise mean of x1, x2 and x3 computed four ways (tibble input).
toy_dat |>
  rowwise() |> 
  mutate(
    # correct: the three columns are listed explicitly
    y1 = mean(c(x1, x2, x3)),
    # incorrect: here `:` is the base sequence operator, so this averages the
    # sequence running from x1 towards x3 in steps of 1, not the three columns
    y2 = mean(x1:x3),
    # incorrect: wrapping the same sequence in c() changes nothing
    y3 = mean(c(x1:x3)),
    # correct: c_across() reads x1:x3 as a tidy-select column range
    y4 = mean(c_across(x1:x3))
  ) |> 
  head()
## # A tibble: 6 × 7
## # Rowwise: 
##       x1     x2     x3     y1     y2     y3     y4
##    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 -2.77  -0.666  1.09  -0.784 -1.27  -1.27  -0.784
## 2 -1.16  -0.908  0.148 -0.640 -0.662 -0.662 -0.640
## 3 -0.377  0.118  1.05   0.264  0.123  0.123  0.264
## 4  0.274 -0.152  0.741  0.288  0.274  0.274  0.288
## 5  0.607  0.730 -0.505  0.277  0.107  0.107  0.277
## 6 -0.343 -0.165  2.37   0.621  0.657  0.657  0.621

可见,在使用rowwise()时,必须将tidy select语法与c_across()结合使用,才能得出正确的计算结果。