Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
ce4925f
add a receiving yardage case
akeaswaran Sep 3, 2025
8aff7ef
docs: Created return tag for internal functions for CRAN
saiemgilani Sep 4, 2025
6ad3e01
chore: Update drive and live play columns, fix tests
saiemgilani Sep 4, 2025
467e1b6
feat: Enhance cfbd_play_stats_player output and docs
saiemgilani Sep 4, 2025
8953e57
fix: Substitute timeouts in cfbd_pbp_data when missing
saiemgilani Sep 4, 2025
61120cf
fix: Specify .groups argument in summarise call
saiemgilani Sep 4, 2025
85f40d5
docs: Update cfbd_drives return documentation
saiemgilani Sep 4, 2025
31d0871
fix: use `dplyr::distinct()` over `dplyr::distinct_all()` spacing and…
saiemgilani Sep 4, 2025
a78c607
add more cases
akeaswaran Sep 15, 2025
221e0f7
Merge branch 'main' into AE/2025-receiving-yardage-parse-case
akeaswaran Sep 15, 2025
9404f5b
not sure these are necessary but sure
akeaswaran Sep 15, 2025
69b981c
matching parse logic
akeaswaran Sep 16, 2025
33f86b4
chore: Fix argument order in expect_in for scoreboard tests
saiemgilani Oct 13, 2025
142a7a0
fix: Skip games with insufficient plays in play-by-play data
saiemgilani Oct 13, 2025
dd8435a
chore: Bump version to 2.1.0 and update release notes
saiemgilani Oct 13, 2025
fad6fde
chore: Normalize column names and update tests
saiemgilani Oct 13, 2025
a5a83fb
Merge branch 'main' into AE/2025-receiving-yardage-parse-case
akeaswaran Jan 4, 2026
a4ff4e1
cleaning up play text before parsing
akeaswaran Jan 4, 2026
e745d39
run and rush cases separated
akeaswaran Jan 4, 2026
4a58455
add prereq method just in case
akeaswaran Jan 4, 2026
f9704f9
fix tests
akeaswaran Jan 4, 2026
08bfb33
chore: Update default season_type and week validation
saiemgilani Jan 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ Suggests:
DBI,
ggplot2,
ggrepel,
patrick,
qs (>= 0.25.1),
rmarkdown,
RSQLite,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ export(cfbd_venues)
export(clean_drive_dat)
export(clean_drive_info)
export(clean_pbp_dat)
export(clean_play_text)
export(create_epa)
export(create_wpa_naive)
export(epa_fg_probs)
Expand Down Expand Up @@ -148,6 +149,7 @@ importFrom(stringr,str_detect)
importFrom(stringr,str_extract)
importFrom(stringr,str_length)
importFrom(stringr,str_remove)
importFrom(stringr,str_replace)
importFrom(stringr,str_replace_all)
importFrom(stringr,str_sub)
importFrom(stringr,str_trim)
Expand Down
35 changes: 33 additions & 2 deletions R/cfbd_pbp_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,7 @@ cfbd_pbp_data <- function(year,
return(NULL)
}
game_plays <- game_plays %>%
clean_play_text() %>%
penalty_detection() %>%
add_play_counts() %>%
clean_pbp_dat() %>%
Expand Down Expand Up @@ -801,8 +802,6 @@ cfbd_pbp_data <- function(year,
return(play_df)
}



#' **Series of functions to help clean the play-by-play data for analysis**
#' @name helpers_pbp
NULL
Expand Down Expand Up @@ -2146,3 +2145,35 @@ clean_drive_info <- function(drive_df) {

return(clean_drive)
}


#' @rdname helpers_pbp
#'
#' @param play_df (*data.frame* required) Plays dataframe pulled from API via the `cfbd_play()` or within the `cfbd_pbp_data()` function.
#' @details Cleans CFB play-by-play text to be compliant with existing play-by-play parsing. Generally not recommended for standalone use. This method exists due to ESPN PBP changes midway through the 2025 season.
#' \describe{
#' \item{`play_text`: Returned as `play_text`}{.}
#' }
#' @return The original `play_df` with the following columns appended to it:
#' \describe{
#' \item{`cleaned_text`: `play_text` with miscellaneous items removed: pass depth/location, clock timestamps, No Huddle/Shotgun status, etc.}{.}
#' }
#' @keywords internal
#' @importFrom rlang .data
#' @importFrom stringr str_replace str_replace_all
#' @importFrom dplyr mutate
#' @export
clean_play_text <- function(play_df) {
  # Each step rewrites `cleaned_text` in sequence; `play_text` itself is left
  # untouched. Every step except the last removes only the FIRST occurrence of
  # its pattern.
  dplyr::mutate(
    play_df,
    # Leading clock timestamp, e.g. "(14:46) ".
    cleaned_text = stringr::str_replace(.data$play_text, "^\\(\\d{1,2}:\\d{2}\\)\\s+", ""),
    # Pass depth ("short"/"deep") and direction ("left"/"middle"/"right").
    cleaned_text = stringr::str_replace(.data$cleaned_text, "\\s(short|deep)\\s", " "),
    cleaned_text = stringr::str_replace(.data$cleaned_text, "\\s(left|middle|right)\\s", " "),
    # Formation tags. The combined "No Huddle-Shotgun" form is removed before
    # the individual tags so no stray "Shotgun" or "-" is left behind.
    cleaned_text = stringr::str_replace(.data$cleaned_text, "\\s*No Huddle-Shotgun\\s+", ""),
    cleaned_text = stringr::str_replace(.data$cleaned_text, "No Huddle-?", ""),
    cleaned_text = stringr::str_replace(.data$cleaned_text, "\\s*Shotgun\\s+", ""),
    # Collapse EVERY run of whitespace to a single space. `str_replace()` only
    # touched the first run, leaving later double spaces in place.
    cleaned_text = stringr::str_replace_all(.data$cleaned_text, "\\s+", " ")
  )
}
87 changes: 49 additions & 38 deletions R/helper_pbp_add_yardage.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,88 +55,99 @@ add_yardage <- function(play_df) {
play_df <- play_df %>%
dplyr::mutate(
yds_rushed = dplyr::case_when(
.data$rush == 1 & stringr::str_detect(.data$play_text, regex("run for no gain", ignore_case = TRUE)) ~ 0,
.data$rush == 1 & stringr::str_detect(.data$cleaned_text, regex("run|rush for no gain", ignore_case = TRUE)) ~ 0,
.data$rush == 1 &
stringr::str_detect(.data$play_text, regex("run for a loss of", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("run|rush for a loss of", ignore_case = TRUE)) ~
-1 * as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= run for a loss of)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<= run|rush for a loss of)[^,]+"), "\\d+"
)),
.data$rush == 1 &
stringr::str_detect(.data$play_text, regex("run for", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("run|rush for \\d+ y.*ds? loss", ignore_case = TRUE)) ~
-1 * as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<= run|rush for)[^,]+"), "\\d+"
)),
.data$rush == 1 &
stringr::str_detect(.data$cleaned_text, regex("run|rush for", ignore_case = TRUE)) ~
as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= run for)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<= run|rush for)[^,]+"), "\\d+"
)),
.data$rush == 1 &
stringr::str_detect(.data$play_text, regex("yd run", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("yd run|rush", ignore_case = TRUE)) ~
as.numeric(
stringr::str_remove(
stringr::str_extract(.data$play_text, regex("\\d{0,2} Yd Run", ignore_case = TRUE)),
regex("yd run", ignore_case = TRUE)
stringr::str_extract(.data$cleaned_text, regex("\\d{0,2} Yd Run|Rush", ignore_case = TRUE)),
regex("yd run|rush", ignore_case = TRUE)
)
),
TRUE ~ NA_real_
),
yds_receiving = dplyr::case_when(
.data$pass == 1 & stringr::str_detect(.data$play_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex("for no gain", ignore_case = TRUE)) ~ 0,
.data$pass == 1 & stringr::str_detect(.data$cleaned_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex("for no gain", ignore_case = TRUE)) ~ 0,
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex("for a loss of", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex("for a loss of", ignore_case = TRUE)) ~
-1 * as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for a loss of)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for a loss of)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass to", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex("for a loss of", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("pass to", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex("for a loss of", ignore_case = TRUE)) ~
-1 * as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for a loss of)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for a loss of)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$cleaned_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex(" for \\d+ y\\w*ds? loss", ignore_case = TRUE)) ~
-1 * as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex(" for \\d+ y\\w*ds?", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("pass complete to", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex(" for \\d+ y\\w*ds?", ignore_case = TRUE)) ~
as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass to", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex(" for \\d+ y\\w*ds?", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("pass to", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex(" for \\d+ y\\w*ds?", ignore_case = TRUE)) ~
as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("Yd pass", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("Yd pass", ignore_case = TRUE)) ~
as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(\\d+)\\s+Yd\\s+pass"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(\\d+)\\s+Yd\\s+pass"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass complete to", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("pass complete to", ignore_case = TRUE)) ~
yards_gained, # 2024 has games that don't have yards in the PBP text but do have them in the yards_gained field.

# 2025 has some plays list "PASSER pass" at the very end of the play_text
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass \\(\\w", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex("^to ", ignore_case = FALSE)) ~ as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for)[^,]+"), "\\d+"
stringr::str_detect(.data$cleaned_text, regex("pass \\(\\w", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex("^to ", ignore_case = FALSE)) ~ as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("pass$", ignore_case = TRUE)) &
stringr::str_detect(.data$play_text, regex("^to ", ignore_case = FALSE)) ~ as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for)[^,]+"), "\\d+"
stringr::str_detect(.data$cleaned_text, regex("pass$", ignore_case = TRUE)) &
stringr::str_detect(.data$cleaned_text, regex("^to ", ignore_case = FALSE)) ~ as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for)[^,]+"), "\\d+"
)),
# 2025 has some plays that have yards in the PBP but no listed passer. the format is the same though
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("^to ", ignore_case = FALSE)) ~ as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for)[^,]+"), "\\d+"
stringr::str_detect(.data$cleaned_text, regex("^to ", ignore_case = FALSE)) ~ as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("^to ", ignore_case = FALSE)) &
stringr::str_detect(.data$play_text, regex("for a loss of", ignore_case = TRUE)) ~
stringr::str_detect(.data$cleaned_text, regex("^to ", ignore_case = FALSE)) &
stringr::str_detect(.data$cleaned_text, regex("for a loss of", ignore_case = TRUE)) ~
-1 * as.numeric(stringr::str_extract(
stringi::stri_extract_first_regex(.data$play_text, "(?<= for a loss of)[^,]+"), "\\d+"
stringi::stri_extract_first_regex(.data$cleaned_text, "(?<=[\\s,]for a loss of)[^,]+"), "\\d+"
)),
.data$pass == 1 &
stringr::str_detect(.data$play_text, regex("^to ", ignore_case = FALSE)) &
stringr::str_detect(.data$play_text, regex("for no gain", ignore_case = TRUE)) ~ 0,
stringr::str_detect(.data$cleaned_text, regex("^to ", ignore_case = FALSE)) &
stringr::str_detect(.data$cleaned_text, regex("for no gain", ignore_case = TRUE)) ~ 0,
TRUE ~ NA_real_
)
)
Expand Down
42 changes: 21 additions & 21 deletions cfbfastR.Rproj
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
13 changes: 13 additions & 0 deletions man/helpers_pbp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions tests/testthat/test-cfbd_pbp_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,36 @@ test_that("base case 2023 pbp are already properly handled", {

testthat::expect_equal(sum(completions$same_same), nrow(completions))
})


# Parameterised regression test for the 2025 play-by-play text format.
# Each case pulls play-by-play data via `cfbd_pbp_data()` for one team/week,
# locates exactly one play by its raw `play_text`, and checks the parsed
# yardage column named by `yards_field` against `expected_yards`.
patrick::with_parameters_test_that(
  "[2025 new PBP] Yardage is successfully calculated",
  {
    testthat::skip_on_cran()
    plays <- cfbd_pbp_data(
      year = year,
      season_type = season_type,
      week = week,
      team = team,
      epa_wpa = TRUE
    )

    # Matching is on the untouched `play_text` column.
    target_plays <- plays[which(plays$play_text == play_text), ]
    testthat::expect_equal(nrow(target_plays), 1)
    testthat::expect_equal(target_plays[1, yards_field][[1]], expected_yards)
  },
  patrick::cases(
    "401754571-yds_receiving-1" = list(year = 2025, season_type = "regular", week = 9, team = "Georgia Tech", play_text = "(14:46) Shotgun #10 H.King pass complete short right to #1 J.Haynes caught at GT27, for 15 yards to the GT40 (#13 G.Bryant III), 1ST DOWN", yards_field = "yds_receiving", expected_yards = 15),
    "401754571-yds_receiving-2" = list(year = 2025, season_type = "regular", week = 9, team = "Georgia Tech", play_text = "(14:17) No Huddle-Shotgun #10 H.King pass complete short right to #4 I.Canion caught at GT46, for 2 yards to the GT42 fumbled by #4 I.Canion at GT46 forced by #16 C.Peal recovered by SU #8 D.Reese at GT42, End Of Play", yards_field = "yds_receiving", expected_yards = 2),
    "401754571-yds_receiving-3" = list(year = 2025, season_type = "regular", week = 9, team = "Georgia Tech", play_text = "(06:15) Shotgun #10 H.King pass incomplete short left to #17 J.Beetham thrown to SU01", yards_field = "yds_receiving", expected_yards = NA_integer_),
    "401754571-yds_rushed-1" = list(year = 2025, season_type = "regular", week = 9, team = "Georgia Tech", play_text = "(13:31) Shotgun #10 H.King rush right for 7 yards gain to the SU30, out of bounds at SU30, 1ST DOWN", yards_field = "yds_rushed", expected_yards = 7),
    "401754571-yds_rushed-2" = list(year = 2025, season_type = "regular", week = 9, team = "Georgia Tech", play_text = "(07:16) No Huddle-Shotgun #1 J.Haynes rush left for 4 yards loss to the SU35 (#6 J.Heard Jr.; #3 K.Singleton)", yards_field = "yds_rushed", expected_yards = -4),
    "401754571-yds_receiving-4" = list(year = 2025, season_type = "regular", week = 9, team = "Syracuse", play_text = "(15:00) No Huddle-Shotgun #10 R.Collins pass complete deep right to #2 J.Cook II caught at GT37, for 41 yards to the GT34 (#6 R.Shelley), 1ST DOWN", yards_field = "yds_receiving", expected_yards = 41),
    # NOTE(review): same inputs and expectation as "401754571-yds_receiving-4"; confirm whether a different play was intended.
    "401754571-yds_receiving-5" = list(year = 2025, season_type = "regular", week = 9, team = "Syracuse", play_text = "(15:00) No Huddle-Shotgun #10 R.Collins pass complete deep right to #2 J.Cook II caught at GT37, for 41 yards to the GT34 (#6 R.Shelley), 1ST DOWN", yards_field = "yds_receiving", expected_yards = 41),
    "401754571-yds_receiving-6" = list(year = 2025, season_type = "regular", week = 9, team = "Syracuse", play_text = "(09:25) No Huddle-Shotgun #10 R.Collins pass complete short left to #2 J.Cook II caught at SU31, for 4 yards to the SU34 (#2 E.Lightsey)", yards_field = "yds_receiving", expected_yards = 4),
    "401754571-yds_receiving-7" = list(year = 2025, season_type = "regular", week = 9, team = "Georgia Tech", play_text = "(05:49) Shotgun #10 H.King pass complete short middle to #85 J.Allen caught at SU33, for 19 yards to the SU09 (#0 B.Long Jr.)", yards_field = "yds_receiving", expected_yards = 19),
    "401777353-yds_receiving-1" = list(year = 2025, season_type = "regular", week = 15, team = "Ohio State", play_text = "(07:37) Shotgun #10 J.Sayin pass complete short left to #4 J.Smith caught at OSU29, for 5 yards loss to the OSU32 (#12 D.Boykin)", yards_field = "yds_receiving", expected_yards = -5),
    "401778302-yds_receiving-1" = list(year = 2025, season_type = "postseason", week = 1, team = "Boise State", play_text = "Shotgun #14 M.Cutforth pass complete deep middle to #3 L.Caples caught at WAS06, for 22 yards to the WAS06 (#18 R.Dillard-Allen), 1ST DOWN", yards_field = "yds_receiving", expected_yards = 22),
    "401634169-base-case-old-pbp" = list(year = 2024, season_type = "regular", week = 1, team = "Purdue", play_text = "Hudson Card pass complete to Drew Biber for 2 yds fumbled, forced by Maddix Blackwell, recovered by INST Garret Ollendieck G. Ollendieck return for 0 yds", yards_field = "yds_receiving", expected_yards = 2)
  )
)
Loading