Haddock Check

When digging into the data for Haddock it appeared that the biomass for haddock in the Henry Bigelow Era, as derived from length-weight relationships was distinctly higher than for in the spring as well as during the Albatross years.

What gives?

Load survdat File

I added a couple options here to select different pulls of survdat since we were seeing some issues in some of them.

The selected survdat pull is 2021

# Use parameter to select source:
survdat_path <- switch(
  EXPR = params$survdat_source, 
  "2016" = paste0(mills_path, "Projects/WARMEM/Old survey data/Survdat_Nye2016.RData"),
  "2020" = paste0(res_path, "NMFS_trawl/prior_survdat_data/Survdat_Nye_Aug 2020.RData"),
  "2021" = paste0(res_path, "NMFS_trawl/prior_survdat_data/survdat_slucey_01152021.RData"),
  "Feb 23 2021" = paste0(res_path, "NMFS_trawl/2021_survdat/NEFSC_BTS_02242021.RData")
  )
                       


# Load any* of the survdat files
load(survdat_path)

# February 2021 following DBI package fix
if(params$survdat_source == "Feb 23 2021"){
  survdat <- survey$survdat 
  rm(survey)}

# clean the names
trawldat <- survdat %>% 
    clean_names()




# Bonus steps for the 2016 and 2021 datasets that don't have common name field
if(params$survdat_source %in% c("2021", "Feb 23 2021")){
  trawldat <- trawldat %>% 
    mutate(svspp = as.character(svspp),
           svspp = str_pad(svspp, 3, "left", "0"))
  
  # Extra prep for 2016/slucey data, getting species names via svspp codes
  spp_classes <- read_csv(paste0(res_path, "NMFS_trawl/spp_keys/sppclass.csv"),
                          col_types = cols()) %>%
    clean_names() %>%
    mutate(common_name = str_to_lower(common_name),
           scientific_name = str_to_lower(scientific_name)) %>%
    distinct(svspp, comname = common_name, scientific_name)
  
  
  # Add the common names over and format for rest of build
  trawldat <- left_join(trawldat, clean_names(spp_classes)) %>%
    drop_na(comname) %>%
    mutate(cruise6 = str_pad(cruise6, 6, "left", "0"),
           station = str_pad(station, 3, "left", "0"),
           stratum = str_pad(stratum, 4, "left", "0"),
           id = str_c(cruise6, station, stratum)) %>%
    select(id, est_year = year, station, stratum, svvessel, season, lat, lon, depth,
           surftemp, surfsalin, bottemp, botsalin, svspp, comname, scientific_name, everything())
}

Clean up Dataset

# Do normal cleanup steps
trawldat <- trawldat %>% 
  mutate(
    # Text Formatting 
    comname = tolower(comname),
    id      = format(id, scientific = FALSE),
    svspp   = as.character(svspp),
    svspp   = str_pad(svspp, 3, "left", "0"),
    
    # Biomass and abundance NA substitutions - flag these events
    no_bio    = ifelse(is.na(biomass) == TRUE & abundance > 0, "bio_flag", "bio_ok"),
    biomass   = ifelse(is.na(biomass) == TRUE & abundance > 0, 0.0001, biomass),
    
    
    # # Checking if error is caused by this adjustment: verdict - not the issue
    no_abund = ifelse(is.na(abundance) == TRUE & biomass > 0, "abund_flag", "anund_ok"),
    abundance = ifelse(is.na(abundance) == TRUE & biomass > 0, 1, abundance),
    
    # Sratum number, excluding leading and trailing codes for inshore/offshore, for matching
    strat_num = str_sub(stratum, 2, 3)) %>%  
  
  # Bump biomass/abundance above 0 if there is a complementary measurement
  # Shouldn't have biomass and no fish etc.
  mutate(biom_adj  = ifelse(biomass == 0 & abundance > 0, 0.0001, biomass), 
         .after = biomass) %>% 
  mutate(abund_adj = ifelse(abundance == 0 & biomass > 0, 1, abundance), 
         .after = abundance)
  


# # Checking if error is caused by this adjustment: verdict - not the issue
# mutate(abund_adj = abundance, .after = abundance)




# Filter Data Down to isolate issues
haddock <- trawldat %>% 
  filter(
    # Filter to just Spring and Fall
    season %in% c("SPRING", "FALL"),
         
    # Filter to only the Albatross and Henry Bigelow Vessels
    svvessel %in% c("AL", "HB"),
    
    # Choose manageable time frame
    est_year >= 2000,
    est_year <= 2020, 
    
    # Pull out Haddock
    comname == "haddock")

Adjust Numbers at Length

Sometimes in the data there are situations where the number recorded for a given species’ “abundance” does not match the amount of fish you get when tallying up the numbers of each length.

To account for this mismatch we use a conversion factor to scale up/down how many of each size class there would have needed to be to have that tally match up.

# Get the abundance value for each sex arrived at by summing across each length
# sometimes there are more measured than initially tallied*
abundance_check <- haddock %>%
  group_by(id, comname, catchsex, abundance) %>%
  summarise(
    numlen_abund = sum(numlen),                
    n_len_class  = n_distinct(length), # number of distinct lengths caught that station
    .groups      = "keep") %>% 
  ungroup()

# Conversion factor translates each size at length to what it would need be to be
# for them to add up to that "abundance" column
conv_factor <- haddock %>% 
  distinct(id, comname, catchsex, length, abund_adj) %>% 
  inner_join(abundance_check) %>% 
  mutate(convers = abund_adj / numlen_abund)
  


# Merge back and convert the numlen field
haddock_clean <- haddock %>%
  left_join(conv_factor) %>%
  mutate(numlen_adj = numlen * convers, .after = numlen) # %>% 
  #select(-c(numlen_abund, convers))

Match to Growth Coefficients

# This table is a combined table of wigley and fishbase L-W coefficients
lw_combined <- read_csv(here::here("data/biomass_key_combined.csv"), col_types = cols()) %>% 
  mutate(svspp = str_pad(svspp, 3, "left", "0"))


# Just use the Wigley Paper Coefficients
wigley_coefficients <- filter(lw_combined, source == "wigley") %>% 
  select(source, season, svspp, comname, scientific_name, spec_class, 
         hare_group, fishery, catchsex, a, b, ln_a)

# merge on comname, season, and catchsex
haddock_lw <- left_join(haddock_clean, wigley_coefficients)

# Calculate the LW Biomasses for given lengths
haddock_lw <- haddock_lw %>% 
  mutate(
    llen          = log(length),                # Self explanatory
    ind_log_wt    = ln_a + (b * llen),          # log(weight) of an individual at that length
    ind_weight_kg = exp(ind_log_wt),            # Weight of an individual in size class
    sum_weight_unadj = ind_weight_kg * numlen,  # unadjusted
    sum_weight_kg = ind_weight_kg * numlen_adj, # Weight of all individuals of size class
    # Adjustment for how BIOMASS is repeated, 
    # puts it in equal amounts for each weight class so we can sum easier
    biom_per_lclass = (biom_adj / n_len_class)  
  )

Look at Differeces Between BIOMASS and LW Biomass

For any given station across all the different survey seasons and and for both vessels you would expect there to be a very tight 1:1 relationship between the weights expected from haddock’s length weight relationship and the actual biomass weights of weighing all the fish together.

What seems really odd is that this is true across all groups, except for data collected on the Bigelow during Fall. For this group a tight linear relationship exists, but it shows a much steeper relationship.

# Checking one species at a time
station_summs <- haddock_lw %>% 
  group_by(id, est_year, season, svvessel, catchsex) %>% 
  summarise(lw_bio = sum(sum_weight_kg),
            lw_unadj = sum(sum_weight_unadj),
            biomass_col = sum(biom_per_lclass))

Biomass & Adjusted Numlen

# Plot against numlen_adj
station_summs %>%   
  ggplot(aes(biomass_col, lw_bio, 
             color = svvessel)) +
  geom_point(show.legend = F) +
  facet_grid(season~svvessel) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color  = "darkblue") +
  scale_color_gmri() +
  labs(x = "Weight from survdat$BIOMASS", y = "Weight from LW relationship",
       caption = "Dashed line depicts 1:1 relationship",
       subtitle = "LW Biomass using numlen_adj")

Biomass & Numlen

# if BIOMASS not been adjusted in FALL, then you'd see mismatches across other seasons 
station_summs %>%   
  ggplot(aes(biomass_col, lw_unadj, 
             color = svvessel)) +
  geom_point(show.legend = F) +
  facet_grid(season~svvessel) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color  = "darkblue") +
  scale_color_gmri() +
  labs(x = "Weight from survdat$BIOMASS", y = "Weight from LW relationship",
       caption = "Dashed line depicts 1:1 relationship",
       subtitle = "LW Biomass using numlen | un-adjusted")

BIOMASS & Abundance

Its possible that abundance and biomass columns have become disconnected in the Fall Bigelow surveys for some reason whether it is from subsampling or some other reason. To make sure that is not the case we can plot these relationships and try to dig into them that way.

This will be tricky because it depends on the average size of fish.

haddock_lw %>% 
  ggplot(aes(abundance, biomass, color = svvessel)) +
  geom_point(show.legend = F) +
  facet_grid(season~svvessel) +
  scale_color_gmri() +
  scale_x_continuous(labels = scales::comma_format()) +
  labs(x = "Station Abundance from survdat$abundance", 
       y = "Biomass weight from survdat$biomass",
       subtitle = "LW Biomass using numlen | un-adjusted")

Fall Bigelow Inspection

So it seems to me that something odd is happening with the data coming from the fall survey in the bigelow era.

The following plot looks at the relationship between the average weight per fish, and how that changes with the average size for a fish species.

If adjustments to BIOMASS and/or ABUNDANCE occur, they should occur in a way that they adjust together and preserve this relationship.

fall_inspection <- haddock_lw %>% 
  #filter(svvessel == "HB" & season == "FALL") %>% 
  group_by(id, svvessel, season, abundance, biomass) %>% 
  summarise(
    bio_per_fish = mean(biomass) / mean(abundance),
    wtd_mean_len = weighted.mean(length, numlen),
    wtd_mean_len_adj = weighted.mean(length, numlen_adj)) %>% 
  ungroup() 


fall_inspection %>% 
  ggplot(aes(wtd_mean_len, bio_per_fish, color = wtd_mean_len)) +
  geom_point(show.legend = F) +
  facet_grid(season~svvessel) +
  scale_color_gmri(discrete = F) +
  scale_x_continuous(labels = scales::comma_format()) +
  labs(x = "Average Fish Length (cm)", 
       y = "Average Fish Weight \n survdat$biomass / survdat$abundance")

And just to verify that the coefficients are consistent for Fall, just note that Fall coefficients are the same for henry bigelow and albatross stations. So either abundance or biomass are no longer in sync, or there is a different cohort with different growth relationships.

haddock_lw %>% 
  distinct(svvessel, season, a, b) %>% 
  kable() %>% 
  kable_styling()

svvessel	season	a	b
AL	SPRING	7.5e-06	3.0766
AL	FALL	7.4e-06	3.0888
HB	SPRING	7.5e-06	3.0766
HB	FALL	7.4e-06	3.0888

Exploring Different Causes:

The following sections are my efforts digging into the different mechanisms that may have possibly caused the patterns seen above.

Is it an issue with the length classes?

Its possible that I introduced some matching issues by trying to cheat my way through with that biomass per length class column.

This would create a duplication of catches in certain cases for the totaling of just the length weight weights. To check this I can double-check that only unique station-species-sex-length groups are being counted.

# Pull distinct records of each stations BIOMASS for haddock
BIOMASS <- haddock_lw %>% 
  distinct(id, est_year, season, svvessel, catchsex, abund_adj, biom_adj) %>% 
  rename(biomass = biom_adj)

# Use distinct to drop any duplicate stations, grab all lw weights
LW <- haddock_lw %>% 
  distinct(id, est_year, season, svvessel, catchsex, numlen_adj, abund_adj, sum_weight_kg) %>% 
  group_by(id, est_year, season, svvessel, catchsex, abund_adj) %>% 
  summarise(lw_biomass = sum(sum_weight_kg))

# add them together, should merge seemlessly as station totals
dup_id_test <- left_join(BIOMASS, LW)

# plot
ggplot(dup_id_test,
       aes(biomass, lw_biomass, 
             color = svvessel)) +
  geom_point(show.legend = F) +
  facet_grid(season~svvessel) +
  scale_color_gmri() +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color  = "darkblue") +
  labs(x = "Weight from survdat$BIOMASS", y = "Weight from LW relationship",
       caption = "Dashed line depicts 1:1 relationship")

Anything remarkable happening with lengths?

Another possible problem could relate to the fit of the growth curve. If there is a poor fit at some end of the curve, then its possible that having a bunch of catch in that size range would impact fit. To check this we can simply plot the length frequencies, looking to see if there is some special signature visible in Fall.

haddock_lw %>% 
  ggplot() + 
  geom_histogram(aes(x = length, fill = svvessel)) + #un-weighted
  geom_histogram(aes(x = length, weight = numlen_adj, fill = svvessel)) + #weighted
  facet_grid(season~svvessel) +
  scale_fill_gmri() +
  labs(x = "Length (cm)", y = "Frequency - Weighted By Numlen")

Checking Abundance Numbers

There may be some catch density issues for this species. The catch in Fall since 2008 is the highest it has ever been, so perhaps catch densities may be the problem.

haddock_lw %>% 
  group_by(est_year, svvessel, season) %>% 
  summarise(total_fish = sum(numlen_adj)) %>% 
  ggplot(aes(est_year, total_fish, color = svvessel)) +
  geom_line(show.legend = F) +
  scale_color_gmri() +
  facet_wrap(~season) +
  labs(x = "", y = "Total Haddock Caught in Survey")

NA Replacement Issues?

One thing that could be happening is that in the Fall, when there are astronomically high numbers, it is not possible to get a total biomass value.

In these events we would still have a size distribution and weights for individuals that are taken as a subsample. This would contribute to length weight biomass, where there is a 0 substituted as total biomass.

NA BIOMASS

Plotting the what the original value of BIOMASS is for each length weight total. If there was one without the other it should be flagged.

# check instances of NA biomass
haddock_lw %>% 
  ggplot(aes(est_year, sum_weight_kg, color = no_bio)) +
  geom_point() +
  scale_color_gmri() +
  labs(x = "", y = "Length Weight Biomass", color = "BIOMASS Check")

NA Abundance

Plotting the what the original value of ABUNDANCE is for each length weight total. If there was one without the other it should be flagged.

# check instances of NA abundance
haddock_lw %>% 
  ggplot(aes(est_year, abundance, color = no_abund)) +
  geom_point() +
  scale_color_gmri() +
  labs(x = "")

Numlen Adjusted Problems

As part of the cleanup process we create that numlen_adj column to account for the mismatch between the number of fish measured and the abundance recorded. This often scales up the numlen column and would increase the length weight biomasses.

It appears from plotting these factors that for Spring, and Fall for the Albatross this value is around 1 and imparts a minor adjustment. In contrast, fall henry bigelow factors are closer to 4.

haddock_lw %>% 
  ggplot(aes(x = convers, color = season)) +
  geom_boxplot() +
  scale_color_gmri() +
  facet_grid(season + svvessel ~  .) +
  labs(x = "Numlen Conversion Factor", y = "Count",
       caption = "Numlen Conversion Factor = survdat$abundance / sum(numlen)") +
  theme(legend.position = "bottom")

If you plot the relationships between the length weight biomass and the survdat$BIOMASS column and account for this adjustment then the relationship looks like this:

convers_check <- haddock_lw %>% 
  filter(svvessel == "HB") %>% 
  mutate(sum_weight_undadj = ind_weight_kg * numlen) %>% 
  group_by(id, est_year, season, svvessel, catchsex) %>% 
  summarise(lw_bio = sum(sum_weight_kg),
            lw_unadj = sum(sum_weight_undadj),
            biomass_col = sum(biom_per_lclass))

convers_check %>% 
  ggplot() +
  geom_point(aes(biomass_col, lw_unadj, 
             color = "Numlen Un-Adjusted")) +
  geom_point(aes(biomass_col, lw_bio, 
             color = "Numlen Adjusted")) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color  = "darkblue") +
  facet_grid(season~.) +
  scale_color_gmri() +
  labs(x = "Weight from survdat$BIOMASS", y = "Weight from LW relationship",
       caption = "Dashed line depicts 1:1 relationship")

#Remaining Questions

Why for Fall Bigelow stations is there a consistently higher conversion rate?
What situations would make it so consistently centered around ~4?
Should we be using the conversion, or un-adjusted numlen

A work by Adam A. Kemberling

Akemberling@gmri.org

Checking Haddock Biomass Discrepancies

Adam A. Kemberling

2021-07-27