Two input paths in RHISEA: run_hisea_all with HISEA-style files or Dataframes
Sosthene Akia
Fisheries and Oceans Canadasosthene.akia@dfo-mpo.ca
Alex Hanke
Fisheries and Oceans Canadaalex.hanke@dfo-mpo.ca
2025-08-19
Source:vignettes/RHISEA_data_type.Rmd
RHISEA_data_type.Rmd
Introduction
This vignette demonstrates two equivalent ways to run
run_hisea_all()
from the RHISEA
package:
-
Data frame inputs (R objects
baseline
andmixture
already loaded), and -
HISEA file inputs (
hisea.std
andhisea.mix
files).
We run the same analysis with LDA classifier and compare the results (means, bootstrap distributions, etc.). The vignette explains the differences, shows reproducible code, and provides interpretation guidance.
Required packages
library(RHISEA)
#> RHISEA loaded successfully
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ lubridate 1.9.4 ✔ tibble 3.3.0
#> ✔ purrr 1.1.0 ✔ tidyr 1.3.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::combine() masks randomForest::combine()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ purrr::lift() masks caret::lift()
#> ✖ purrr::map() masks mclust::map()
#> ✖ randomForest::margin() masks ggplot2::margin()
#> ✖ dplyr::select() masks MASS::select()
#> ✖ dplyr::slice() masks xgboost::slice()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
#>
#> Attaching package: 'data.table'
#>
#> The following objects are masked from 'package:lubridate':
#>
#> hour, isoweek, mday, minute, month, quarter, second, wday, week,
#> yday, year
#>
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
#>
#> The following object is masked from 'package:purrr':
#>
#> transpose
library(mclust) # used by some RHISEA internals or for post-analysis
Load the data
# Option A: load .rda saved objects (development)
baseline_file <- system.file("extdata", "baseline.rda", package = "RHISEA")
mixture_file <- system.file("extdata", "mixture.rda", package = "RHISEA")
load(baseline_file) # loads `baseline` data.frame
load(mixture_file) # loads `mixture` data.frame
# Quick checks
stopifnot(is.data.frame(baseline) || is.data.table(baseline))
stopifnot(is.data.frame(mixture) || is.data.table(mixture))
str(baseline)
#> Classes 'data.table' and 'data.frame': 300 obs. of 3 variables:
#> $ d13c : num -7.99 -8.23 -7.99 -7.32 -9.01 -8.19 -8.48 -8.42 -8.02 -7.9 ...
#> $ d18o : num -0.46 -0.58 -0.89 -1.06 -0.89 -1.25 -0.27 -0.72 -0.74 -0.51 ...
#> $ population: Factor w/ 2 levels "East","West": 1 1 1 1 1 1 1 1 1 1 ...
#> - attr(*, ".internal.selfref")=<externalptr>
str(mixture)
#> tibble [4,500 × 2] (S3: tbl_df/tbl/data.frame)
#> $ d13c_ukn: chr [1:4500] "-8.6" "-8.8362287144933909" "-8.48" "-8.6199999999999992" ...
#> $ d18o_ukn: chr [1:4500] "-1.1599999999999999" "-1.3548853367293201" "-0.93" "-1.91" ...
# Ensure variable names expected by run_hisea_all
# baseline should include: d13c, d18o, population
# mixture should include: d13c_ukn, d18o_ukn
Write HISEA formatted files (optional)
If you want to compare with HISEA text format, write files from the data frames:
write_std_from_dataframe(df = baseline, stock_col = "population", var_cols = c("d13c", "d18o"))
write_mix_from_dataframe(df = mixture, var_cols = c("d13c_ukn", "d18o_ukn"))
# files created in working directory: "hisea.std", "hisea.mix"
list.files(pattern = "hisea\\.(std|mix)$")
#> [1] "hisea.mix" "hisea.std"
Analysis parameters
np <- 2
nv <- 2
Nsamps <- 1000 # number of bootstrap/resampling iterations inside run_hisea_all (depends on function)
Nmix <- 100
baseline_file <- "hisea.std"
mixture_file <- "hisea.mix"
stock_labels <- c("East", "West")
stocks_names <- c("East", "West")
resample_baseline <- TRUE
resampled_baseline_sizes <- c(50, 50)
# set seed for reproducibility
seed_val <- 123456
1) Run with Data frame inputs
DataFrame_option <- run_hisea_all(
type = "ANALYSIS",
np = np,
phi_method = "cv", # chosen in your script: cv or standard; we use "cv" here
nv = nv,
resample_baseline = resample_baseline,
resampled_baseline_sizes = resampled_baseline_sizes,
seed_val = seed_val,
nsamps = Nsamps,
Nmix = Nmix,
baseline_input = baseline,
mix_input = mixture,
method_class = "LDA",
stocks_names = stock_labels,
stock_col = "population",
var_cols_std = c("d13c", "d18o"),
var_cols_mix = c("d13c_ukn", "d18o_ukn")
)
#> Warning in run_hisea_all(type = "ANALYSIS", np = np, phi_method = "cv", :
#> resample_baseline only for SIMULATION/BOOTSTRAP; disabling.
# Print summary statistics
print("Data frame input — mean estimates:")
#> [1] "Data frame input — mean estimates:"
print(round(DataFrame_option$mean, 4))
#> RAW COOK COOKC EM ML
#> East 0.3771 0.3402 0.3402 0.3402 0.3217
#> West 0.6229 0.6598 0.6598 0.6598 0.6783
# If the object contains bootstrap results, show a small summary
if (!is.null(DataFrame_option$boot) ) {
cat("\nBootstrap summary (data frame input):\n")
print(summary(DataFrame_option$boot))
}
What this block does
- Runs the analysis using the in-memory
baseline
andmixture
objects. -
DataFrame_option$mean
should contain the estimated mean mixture composition (e.g., estimated proportions for East and West) averaged across resamples. -
DataFrame_option$boot
(if present) might contain bootstrap distribution of estimates — used to compute CI, sd, etc.
2) Run with HISEA-formatted files
HiseaFormat_option <- run_hisea_all(
type = "ANALYSIS",
np = np,
phi_method = "cv",
nv = nv,
resample_baseline = resample_baseline,
resampled_baseline_sizes = resampled_baseline_sizes,
seed_val = seed_val,
nsamps = Nsamps,
Nmix = Nmix,
baseline_input = baseline_file,
mix_input = mixture_file,
method_class = "LDA",
stocks_names = stock_labels
)
#> Warning in run_hisea_all(type = "ANALYSIS", np = np, phi_method = "cv", :
#> resample_baseline only for SIMULATION/BOOTSTRAP; disabling.
print("HISEA file input — mean estimates:")
#> [1] "HISEA file input — mean estimates:"
print(round(HiseaFormat_option$mean, 4))
#> RAW COOK COOKC EM ML
#> East 0.3771 0.3402 0.3402 0.3402 0.3217
#> West 0.6229 0.6598 0.6598 0.6598 0.6783
if (!is.null(HiseaFormat_option$boot) ) {
cat("\nBootstrap summary (HISEA file input):\n")
print(summary(HiseaFormat_option$boot))
}
What to expect
- If
write_std_from_dataframe()
/write_mix_from_dataframe()
produced identical formatted files to the in-memory data, results should be extremely similar. Small differences can arise from formatting, row order, or numeric rounding. - If results differ substantially, see Troubleshooting below.
Compare results: table + plot
This section produces a comparison table and simple plot of the mean estimates and their differences.
# collect means
df_compare <- tibble(
mean_df = as.numeric(DataFrame_option$mean),
mean_file = as.numeric(HiseaFormat_option$mean)
) %>%
mutate(diff = mean_df - mean_file)
kable(df_compare, digits = 4, caption = "Comparison of estimates: Data frame vs HISEA file")
mean_df | mean_file | diff |
---|---|---|
0.3771 | 0.3771 | 0 |
0.6229 | 0.6229 | 0 |
0.3402 | 0.3402 | 0 |
0.6598 | 0.6598 | 0 |
0.3402 | 0.3402 | 0 |
0.6598 | 0.6598 | 0 |
0.3402 | 0.3402 | 0 |
0.6598 | 0.6598 | 0 |
0.3217 | 0.3217 | 0 |
0.6783 | 0.6783 | 0 |
# Plot side-by-side bars
df_plot <- df_compare %>% pivot_longer(cols = c(mean_df, mean_file), names_to = "input_type", values_to = "estimate")
Visualize bootstrap distributions (if available)
If the objects returned contain bootstrap distributions (common when
nsamps
used), plot histograms/density for each stock and
input type:
DataFrame_option_boot <- run_hisea_all(
type = "BOOTSTRAP",
np = np,
phi_method = "cv", # chosen in your script: cv or standard; we use "cv" here
nv = nv,
resample_baseline = resample_baseline,
resampled_baseline_sizes = resampled_baseline_sizes,
seed_val = seed_val,
nsamps = Nsamps,
Nmix = Nmix,
baseline_input = baseline,
mix_input = mixture,
method_class = "LDA",
stocks_names = stock_labels,
stock_col = "population",
var_cols_std = c("d13c", "d18o"),
var_cols_mix = c("d13c_ukn", "d18o_ukn")
)
HiseaFormat_option_boot <- run_hisea_all(
type = "BOOTSTRAP",
np = np,
phi_method = "cv",
nv = nv,
resample_baseline = resample_baseline,
resampled_baseline_sizes = resampled_baseline_sizes,
seed_val = seed_val,
nsamps = Nsamps,
Nmix = Nmix,
baseline_input = baseline_file,
mix_input = mixture_file,
method_class = "LDA",
stocks_names = stock_labels
)
make_boot_df <- function(boot_obj, label){
if (is.null(boot_obj)) return(NULL)
# adapt depending on structure
boot_mat <- as.matrix(boot_obj) # try to coerce
df <- as.data.frame(boot_mat)
names(df) <- stock_labels[1:ncol(df)]
df_long <- pivot_longer(df, cols = everything(), names_to = "stock", values_to = "estimate")
df_long$input <- label
df_long
}
boot_df1 <- make_boot_df(DataFrame_option_boot$boot, "data_frame")
boot_df2 <- make_boot_df(HiseaFormat_option_boot$boot, "hisea_file")
boot_all <- bind_rows(boot_df1, boot_df2)
if (!is.null(boot_all) && nrow(boot_all) > 0) {
ggplot(boot_all, aes(x = estimate)) +
geom_density() +
facet_grid(stock ~ input, scales = "free") +
labs(title = "Bootstrap distributions by stock and input type")
}
Interpretation & Reasoning (how to read results)
Mean estimates (
$mean
): these are the estimated proportions of each stock in the mixture averaged across resamples. Example:mean = c(0.87, 0.13)
would indicate 87% East, 13% West.Bootstrap distributions: provide variability — use to compute 95% CI or sd. If bootstrap distributions between
DataFrame_option
andHiseaFormat_option
overlap strongly and mean differences are tiny, the two input modes are effectively equivalent.