We discovered that vessel conversion factors had been applied incorrectly in past retrievals of the federal bottom trawl survey data that we obtained from the NEFSC. After resolving these problems, we set up this document to show whether any of the changes may have important implications for analyses that have recently been conducted using earlier datasets.
This report compares two datasets:
survdat_Nye_allseasons.RData
= 2019 Survdat
NEFSC_BTS_all_seasons_03032021.RData
= March 2021 Survdat
Load Packages Paths and Themes
###__ Packages ####
library(here)
library(janitor)
library(gmRi)
library(patchwork)
library(tidyverse)
library(knitr)
library(kableExtra)
# Set Paths to Box Resources
box_paths <- research_access_paths(os.use = "unix")
mills_path <- box_paths$mills
res_path <- box_paths$res
nmfs_path <- shared.path("unix", "RES_Data", "NMFS_Trawl")
#### Set theme ####
theme_set(theme_bw() + theme(
axis.text.y = element_text(size = 11),
axis.text.x = element_text(angle = 90, vjust = 0.5)))
Load SURVDAT Sources
# Cleanup functions
source(here("R/01_nefsc_ss_build_nodrop.R"))
# 2019 Data used in 2020 paper
load(paste0(nmfs_path, "Survdat_Nye_allseason.RData"))
survdat_nye <- survdat %>% clean_names()
# Data we just received in 2021 with errors located and corrected
load(paste0(nmfs_path, "2021_survdat/NEFSC_BTS_all_seasons_03032021.RData"))
survdat_21 <- survey$survdat %>% clean_names()
Run Standard Cleanup
# Run cleanup
survdat_nye <- survdat_prep_nodrop(survdat = survdat_nye) %>%
mutate(survdat_source = "survdat_nye")
# Run cleanup
survdat_21 <- survdat_prep_nodrop(survdat = survdat_21) %>%
mutate(survdat_source = "survdat_2021")
rm(survdat, survey)
Load List of Published Species
#### Load the species list from Andrew
species_check <- read_csv(here("data/andrew_species/Assesmentfishspecies.csv"),
col_types = cols())
species_check <- species_check %>%
clean_names() %>%
mutate(svspp = str_pad(svspp, 3, pad = "0", side = "left"),
comname = tolower(comname),
species = str_to_title(species)) %>%
arrange(svspp)
# Put in a list
source_list <- list("survdat_nye_2019" = survdat_nye,
"survdat_2021" = survdat_21)
In this code chunk the different sources are put together in a list, and the same cleanup steps are applied to each. The years are limited to before 2019 so there are no extra years in some not available in others. The SVSPP codes are also limited to just the ones in the list used by Andrew for Allyn et al. (2020).
After that, repeated records for abundance and biomass are dropped. These repeats exist to accommodate the length and number at length details. We don’t need them for this so they are dropped here.
# Make years comparable
# Filter species down for both
source_list <- map(source_list, function(survdat_data){
# filter years
sdat <- survdat_data %>%
filter(est_year >= 2008,
est_year <= 2017,
svspp %in% species_check$svspp)
# Pull distinct station abundance and biomass for each species
sdat <- sdat %>%
distinct(id, est_year, season, svvessel,
comname, svspp, catchsex, abundance, biom_adj) })
# Split each survdat set into a list of single species tables
source_splits <- map(source_list, function(source_data){
source_split <- source_data %>% split(.$comname) })
To make comparisons between the different data sets I first check that the svspp code is present in the dataset. Then for each source I get a total annual value for biomass and abundance. I then compute the percent difference in annual abundance/biomass for the two data sets. Then across data sources I calculate the correlation between these values, and the percent change from one data source to the most recently revised data pull. These results are shown in the last set of plots for some highlighted species (haddock, lobster) and for a few species with poor correlation or large percent change in abundance or biomass.
# Make Comparisons
# Vector of species and their common names, atleast the common names Andrew used
andrew_species <- species_check$comname %>% setNames(species_check$svspp)
# Pulling out details for the species
species_comparisons <- imap(andrew_species, function(species_comname, species_svspp){
# there are some name mismatches in common name, so catch those and return an error
# that explains which data it was not found in
in_nye <- species_comname %in% names(source_splits$survdat_nye_2019)
in_21 <- species_comname %in% names(source_splits$survdat_2021)
in_both <- in_nye & in_21
if(in_both == FALSE){
return(list("in_nye" = in_nye,
"in_21" = in_21,
"data" = data.frame(),
"metrics" = data.frame())) }
# Pull just that species from the 2019 data
summ_19 <- source_list$survdat_nye_2019 %>%
filter(svspp == species_svspp) %>%
group_by(comname, est_year) %>%
summarise(abund_19 = sum(abundance, na.rm = T),
biom_19 = sum(biom_adj, na.rm = T),
.groups = "keep") %>% ungroup()
# and 2021 data
summ_21 <- source_list$survdat_2021 %>%
filter(svspp == species_svspp) %>%
group_by(comname, est_year) %>%
summarise(abund_21 = sum(abundance, na.rm = T),
biom_21 = sum(biom_adj, na.rm = T),
.groups = "keep") %>% ungroup()
# join them for side-by-side dataframe
yr_df <- data.frame("est_year" = c("2008":"2017"))
combined_data <- yr_df %>%
left_join(summ_19, by = "est_year") %>%
left_join(summ_21, by = c("comname", "est_year")) %>%
mutate(
abund_change_19to21 = ((abund_21 - abund_19) / abund_19) * 100,
biom_change_19to21 = ((biom_21 - biom_19) / biom_19) * 100)
#return(combined_data)
# overall abundance correlation
abund_cor_19to21 <- cor(combined_data$abund_21,
combined_data$abund_19,
use = "pairwise.complete.obs")
# overall biomass correlation
biomass_cor_19to21 <- cor(combined_data$biom_21,
combined_data$biom_19,
use = "pairwise.complete.obs")
# overall average shifts
abund_shift_19to21 <- mean(combined_data$abund_change_19to21, na.rm = T)
biom_shift_19to21 <- mean(combined_data$biom_change_19to21, na.rm = T)
# put in list to export
list("data" = combined_data,
"metrics" = data.frame(
"comname" = species_comname,
"svspp" = species_svspp,
"abund_corr_19to21" = abund_cor_19to21,
"biom_corr_19to21" = biomass_cor_19to21,
"perc_abund_19to21" = abund_shift_19to21,
"perc_biom_19to21" = biom_shift_19to21)
)
})
For simplicity I then append the tables together so they are single data frames and not lists.
# put data and metrics into a table
comparison_data <- map_dfr(species_comparisons, ~.x[["data"]])
comparison_metrics <- map_dfr(species_comparisons, ~.x[["metrics"]])
As a quick visual reference, here is how total annual abundance and biomass compare across data sources. The species have been split into groups to make paneling less crowded.
# How many species are there, and how many can we panel well - 84
#unique(andrew_species)
# make even number groups
even_groups <- list(
group_1 = sort(andrew_species)[1:17],
group_2 = sort(andrew_species)[18:34],
group_3 = sort(andrew_species)[35:51],
group_4 = sort(andrew_species)[52:68],
group_5 = sort(andrew_species)[69:84])
Building Plots for Actual Abundance/Biomass
timeseries_plots <- map(even_groups, function(species_subset){
species_subset_dat <- comparison_data %>%
filter(comname %in% species_subset) %>%
mutate(comname = str_to_title(comname),
comname = fct_drop(comname),
comname = fct_inorder(comname))
# Abundance plot
abundance_plot <- ggplot(species_subset_dat, aes(est_year)) +
geom_jitter(aes(y = abund_19, color = "Survdat Nye Allseason 2019"),
size = 1, height = 0, width = 0.1) +
geom_jitter(aes(y = abund_21, color = "Newest Survdat"),
size = 1, height = 0, width = 0.1) +
geom_line(aes(y = abund_19,
color = "Survdat Nye Allseason 2019",
linetype = "Survdat Nye Allseason 2019"),
group = 1, alpha = 0.7) +
geom_line(aes(y = abund_21,
color = "Newest Survdat",
linetype = "Newest Survdat"),
group = 1, alpha = 0.7) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
facet_wrap(~comname, scales = "free") +
labs(x = "", y = "Annual Abundance",
color = "Survdat Retrieval",
linetype = "Survdat Retrieval") +
theme(legend.position = "none")
# Biomass plot
biomass_plot <- ggplot(species_subset_dat, aes(est_year)) +
geom_jitter(aes(y = biom_19, color = "Survdat Nye Allseason 2019"),
size = 1, height = 0, width = 0.1) +
geom_jitter(aes(y = biom_21, color = "Newest Survdat"),
size = 1, height = 0, width = 0.1) +
geom_line(aes(y = biom_19,
color = "Survdat Nye Allseason 2019",
linetype = "Survdat Nye Allseason 2019"),
group = 1, alpha = 0.7) +
geom_line(aes(y = biom_21,
color = "Newest Survdat",
linetype = "Newest Survdat"),
group = 1, alpha = 0.7) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
facet_wrap(~comname, scales = "free") +
labs(x = "", y = "Annual Biomass",
color = "Survdat Retrieval",
linetype = "Survdat Retrieval") +
theme(legend.position = "bottom")
return(list(a_plot = abundance_plot, b_plot = biomass_plot,
stacked = (abundance_plot / biomass_plot)))
})
Building Plots for Percent Change in Abundance/Biomass
# Build list of percent difference plots
timeseries_diff_plots <- map(even_groups, function(species_subset){
species_subset_dat <- comparison_data %>%
filter(comname %in% species_subset) %>%
mutate(comname = str_to_title(comname),
comname = fct_drop(comname),
comname = fct_inorder(comname))
# Abundance plot
abundance_plot <- ggplot(species_subset_dat, aes(est_year)) +
geom_jitter(aes(y = abund_change_19to21, color = "Abundance Change 2019 to 2021"),
size = 1, height = 0, width = 0.1) +
geom_line(aes(y = abund_change_19to21,
color = "Abundance Change 2019 to 2021",
linetype = "Abundance Change 2019 to 2021"),
group = 1, alpha = 0.7) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
facet_wrap(~comname, scales = "free") +
labs(x = "",
y = "Percent Difference in Annual Abundance",
color = "Survdat Retrieval Comparison",
linetype = "Survdat Retrieval Comparison") +
theme(legend.position = "none")
# Biomass Plot
biomass_plot <- ggplot(species_subset_dat, aes(est_year)) +
geom_jitter(aes(y = biom_change_19to21, color = "Biomass Change 2019 to 2021"),
size = 1, height = 0, width = 0.1) +
geom_line(aes(y = biom_change_19to21,
color = "Biomass Change 2019 to 2021",
linetype = "Biomass Change 2019 to 2021"),
group = 1, alpha = 0.7) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
facet_wrap(~comname, scales = "free") +
labs(x = "",
y = "Percent Difference in Annual Biomass",
color = "Survdat Retrieval Comparison",
linetype = "Survdat Retrieval Comparison") +
theme(legend.position = "bottom")
return(list(a_plot = abundance_plot, b_plot = biomass_plot,
stacked = (abundance_plot / biomass_plot)))
})
timeseries_plots$group_1$stacked
timeseries_diff_plots$group_1$stacked
timeseries_plots$group_2$stacked
#### Percent Differences
timeseries_diff_plots$group_2$stacked
timeseries_plots$group_3$stacked
#### Percent Differences
timeseries_diff_plots$group_3$stacked
timeseries_plots$group_4$stacked
#### Percent Differences
timeseries_diff_plots$group_4$stacked
timeseries_plots$group_5$stacked
#### Percent Differences
timeseries_diff_plots$group_5$stacked
haddock_data <- comparison_data %>%
filter(comname == "haddock")
ab_abund <- haddock_data %>%
ggplot(aes(est_year)) +
geom_line(aes(y = abund_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(y = abund_21, color = "Newest Survdat")) +
scale_color_gmri() +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
labs(x = "",
y = "Abundance",
subtitle = "Haddock Absolute Abundance", color = "") +
theme(legend.position = "bottom")
p_change <- haddock_data %>%
ggplot(aes(est_year, abund_change_19to21)) +
geom_line() +
scale_x_continuous(breaks = scales::pretty_breaks()) +
labs(y = "Percent Change in Abundance\n2019 -> 2021", x = "")
ab_abund / p_change
lob_data <- comparison_data %>%
filter(comname == "american lobster")
ab_abund <- lob_data %>%
ggplot(aes(est_year)) +
geom_line(aes(y = abund_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(y = abund_21, color = "Newest Survdat")) +
scale_color_gmri() +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
labs(x = "", y = "Abundance",
subtitle = "American Lobster Absolute Abundance", color = "") +
theme(legend.position = "bottom")
p_change <- lob_data %>%
ggplot(aes(est_year, abund_change_19to21)) +
geom_line() +
scale_x_continuous(breaks = scales::pretty_breaks()) +
labs(y = "Percent Change in Abundance\n2019 -> 2021", x = "")
ab_abund / p_change
# correlation
corr_cutoff <- 0.8
corr_species <- comparison_metrics %>%
filter(abund_corr_19to21 <= corr_cutoff) %>%
mutate(comname = fct_drop(comname)) %>%
pull(comname)
# filter species
corr_sub <- comparison_data %>%
filter(comname %in% corr_species)
# how many species per plot
p1 <- corr_species[1:6]
p2 <- corr_species[-c(1:6)]
# p1
corr_sub %>%
filter(comname %in% p1) %>%
mutate(comname = fct_drop(comname)) %>%
ggplot() +
geom_line(aes(est_year, abund_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(est_year, abund_21, color = "Newest Survdat")) +
facet_wrap(~comname, ncol = 1, scales = "free" ) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
labs(x = "",
y = "Annual Abundance",
title = paste0("Species with Annual Abundance Correlation <= ", corr_cutoff),
color = "Survdat Source",
subtitle = "Subset 1")
# p1
corr_sub %>%
filter(comname %in% p2) %>%
mutate(comname = fct_drop(comname)) %>%
ggplot() +
geom_line(aes(est_year, abund_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(est_year, abund_21, color = "Newest Survdat")) +
facet_wrap(~comname, ncol = 1, scales = "free" ) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
labs(x = "",
y = "Annual Abundance",
title = paste0("Species with Annual Abundance Correlation <= ", corr_cutoff),
color = "Survdat Source",
subtitle = "Subset 2")
# percent change
perc_cutoff <- 25
perc_species <- comparison_metrics %>%
filter(perc_abund_19to21 >= perc_cutoff) %>%
mutate(comname = fct_drop(comname)) %>%
pull(comname)
comparison_data %>%
filter(comname %in% perc_species) %>%
mutate(comname = fct_drop(comname)) %>%
ggplot() +
geom_line(aes(est_year, abund_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(est_year, abund_21, color = "Newest Survdat")) +
facet_wrap(~comname, ncol = 1, scales = "free") +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
labs(x = "",
y = "Annual Abundance",
title = paste0("Species with Avg. Shift in Abundance >= ", perc_cutoff, "%"),
color = "Survdat Source")
ab_biom <- haddock_data %>%
ggplot(aes(est_year)) +
geom_line(aes(y = biom_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(y = biom_21, color = "Newest Survdat")) +
scale_color_gmri() +
scale_y_continuous(labels = scales::comma_format()) +
labs(x = "", y = "Biomass", subtitle = "Haddock Absolute Biomass", color = "") +
theme(legend.position = "bottom")
p_change <- haddock_data %>%
ggplot(aes(est_year, biom_change_19to21)) +
geom_line() +
labs(y = "Percent Change in Biomass\n2019 -> 2021", x = "")
ab_biom / p_change
ab_biom <- lob_data %>%
ggplot(aes(est_year)) +
geom_line(aes(y = biom_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(y = biom_21, color = "Newest Survdat")) +
scale_color_gmri() +
scale_y_continuous(labels = scales::comma_format()) +
labs(x = "", y = "Biomass", subtitle = "American Lobster Absolute Biomass", color = "") +
theme(legend.position = "bottom")
p_change <- lob_data %>%
ggplot(aes(est_year, biom_change_19to21)) +
geom_line() +
labs(y = "Percent Change in Biomass\n2019 -> 2021", x = "")
ab_biom / p_change
# correlation
corr_cutoff <- 0.8
corr_species <- comparison_metrics %>%
filter(biom_corr_19to21 <= corr_cutoff) %>%
mutate(comname = fct_drop(comname)) %>%
pull(comname)
# plot
if(length(corr_species) > 0) {
comparison_data %>%
filter(comname %in% corr_species) %>%
mutate(comname = fct_drop(comname)) %>%
ggplot() +
geom_line(aes(est_year, biom_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(est_year, biom_21, color = "Newest Survdat")) +
facet_wrap(~comname, ncol = 1, scales = "free" ) +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
labs(x = "",
y = "Annual Biomass",
title = paste0("Species with Annual Biomass Correlation <= ", corr_cutoff),
color = "Survdat Source")
}
# percent change
perc_cutoff <- 25
perc_species <- comparison_metrics %>%
filter(perc_biom_19to21 >= perc_cutoff) %>%
mutate(comname = fct_drop(comname)) %>%
pull(comname)
comparison_data %>%
filter(comname %in% perc_species) %>%
mutate(comname = fct_drop(comname)) %>%
ggplot() +
geom_line(aes(est_year, biom_19, color = "Survdat Nye Allseason 2019")) +
geom_line(aes(est_year, biom_21, color = "Newest Survdat")) +
facet_wrap(~comname, ncol = 1, scales = "free") +
scale_y_continuous(labels = scales::comma_format()) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_color_gmri() +
labs(x = "",
y = "Annual Biomass",
title = paste0("Species with Avg. Shift in Biomass >= ", perc_cutoff, "%"),
color = "Survdat Source")
A work by Adam A. Kemberling
Akemberling@gmri.org