library(terra)
library(sf)
library(tidymodels)
library(ranger)
library(dplyr)
library(spatialsample)
library(waywiser)
library(vip)

Read data:
trainingdata <- sf::st_read("https://github.com/LOEK-RS/FOSSGIS2025-examples/raw/refs/heads/main/data/temp_train.gpkg")
predictors <- terra::rast("https://github.com/LOEK-RS/FOSSGIS2025-examples/raw/refs/heads/main/data/predictors.tif")
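Extraction and prediction assume that the training points and the predictor raster share a coordinate reference system. A minimal sanity check (the st_transform() call is only needed if the CRSs differ):

# check that points and raster use the same CRS
sf::st_crs(trainingdata) == sf::st_crs(terra::crs(predictors))

# if not, reproject the points to the raster's CRS:
# trainingdata <- sf::st_transform(trainingdata, terra::crs(predictors))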
Prepare data:

trainDat <- sf::st_as_sf(terra::extract(predictors, trainingdata, bind = TRUE))
predictor_names <- names(predictors) # Extract predictor names from the raster
response_name <- "temp"Compared to caret, no dropping of the geometries is required
First, we train a random forest model. This is done by defining a recipe and a model, and then combining them into a workflow. Such a workflow can then be used to fit the model to the data.
# Define the recipe
formula <- as.formula(paste(
  response_name,
  "~",
  paste(predictor_names, collapse = " + ")
))
recipe <- recipes::recipe(formula, data = trainDat)
rf_model <- parsnip::rand_forest(trees = 100, mode = "regression") |>
  set_engine("ranger", importance = "impurity")

# Create the workflow
workflow <- workflows::workflow() |>
  workflows::add_recipe(recipe) |>
  workflows::add_model(rf_model)
# Fit the model
rf_fit <- parsnip::fit(workflow, data = trainDat)

Now let’s use the model for spatial prediction with terra::predict().
prediction_raster <- terra::predict(predictors, rf_fit)
plot(prediction_raster)

Cross-validation requires specifying how the data are split into folds. Here, we define a non-spatial cross-validation with rsample::vfold_cv() and a spatial cross-validation with spatialsample::spatial_block_cv().
random_folds <- rsample::vfold_cv(trainDat, v = 4)
block_folds <- spatialsample::spatial_block_cv(trainDat, v = 4, n = 2)
spatialsample::autoplot(block_folds)
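Block CV is only one of several strategies offered by spatialsample; as an illustrative alternative (not used further here), folds can also be built by clustering the point coordinates:

# alternative: folds from k-means clustering of the coordinates
cluster_folds <- spatialsample::spatial_clustering_cv(trainDat, v = 4)
spatialsample::autoplot(cluster_folds)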
# control cross-validation
keep_pred <- tune::control_resamples(save_pred = TRUE, save_workflow = TRUE)

Next, we fit the model to the data using cross-validation with tune::fit_resamples().
### Cross-validation
rf_random <- tune::fit_resamples(
  workflow,
  resamples = random_folds,
  control = keep_pred
)

rf_spatial <- tune::fit_resamples(
  workflow,
  resamples = block_folds,
  control = keep_pred
)
### get CV metrics
# rf_spatial$.metrics # metrics from each fold
tune::collect_metrics(rf_random)

# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 0.958 4 0.0990 Preprocessor1_Model1
2 rsq standard 0.898 4 0.0227 Preprocessor1_Model1
tune::collect_metrics(rf_spatial)

# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 1.35 4 0.314 Preprocessor1_Model1
2 rsq standard 0.745 4 0.0577 Preprocessor1_Model1
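Because we set save_pred = TRUE, the fold-level predictions are stored and can be inspected with tune::collect_predictions(), e.g., to compute the RMSE per spatial fold (a minimal sketch; column names follow tune's conventions):

# fold-level predictions saved via keep_pred
cv_preds <- tune::collect_predictions(rf_spatial)

# RMSE per fold
cv_preds |>
  dplyr::group_by(id) |>
  yardstick::rmse(truth = temp, estimate = .pred)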
Similar to caret, we first define folds and a resampling control. The final model, however, is stored in a separate object.
Next, we tune the model hyperparameters. For this, we update the workflow to include the tuning specifications, marking parameters with tune() inside the model definition, and define a grid of hyperparameters to search over. The tuning is done with tune::tune_grid().
# mark two parameters for tuning:
rf_model <- parsnip::rand_forest(
  trees = 100,
  mode = "regression",
  mtry = tune(),
  min_n = tune()
) |>
  set_engine("ranger", importance = "impurity")
workflow <- update_model(workflow, rf_model)
# define tune grid:
grid_rf <- grid_space_filling(
  mtry(range = c(1, 20)),
  min_n(range = c(2, 10)),
  size = 30
)
# tune:
rf_tuning <- tune_grid(
  workflow,
  resamples = block_folds,
  grid = grid_rf,
  control = keep_pred
)

The results can be extracted with collect_metrics() and then visualized.
rf_tuning |>
  collect_metrics()

# A tibble: 60 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 5 rmse standard 1.88 4 0.278 Preprocessor1_Model01
2 1 5 rsq standard 0.610 4 0.100 Preprocessor1_Model01
3 1 9 rmse standard 1.96 4 0.322 Preprocessor1_Model02
4 1 9 rsq standard 0.603 4 0.125 Preprocessor1_Model02
5 2 4 rmse standard 1.71 4 0.328 Preprocessor1_Model03
6 2 4 rsq standard 0.639 4 0.0833 Preprocessor1_Model03
7 2 2 rmse standard 1.59 4 0.263 Preprocessor1_Model04
8 2 2 rsq standard 0.679 4 0.0917 Preprocessor1_Model04
9 3 7 rmse standard 1.53 4 0.311 Preprocessor1_Model05
10 3 7 rsq standard 0.712 4 0.0693 Preprocessor1_Model05
# ℹ 50 more rows
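The best-performing candidates can also be listed directly with tune::show_best():

# top 5 hyperparameter combinations by RMSE
tune::show_best(rf_tuning, metric = "rmse", n = 5)

For an overview across the whole grid, the metrics can be visualized: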
rf_tuning |>
  collect_metrics() |>
  mutate(min_n = factor(min_n)) |>
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(linewidth = 1.5, alpha = 0.6) +
  geom_point(size = 2) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10(labels = scales::label_number()) +
  scale_color_viridis_d(option = "plasma", begin = .9, end = 0)

Finally, we can extract the best model and use it to get the variable importance and make predictions.
finalmodel <- fit_best(rf_tuning)
finalmodel

══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────
Ranger result
Call:
ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~19L, x), num.trees = ~100, min.node.size = min_rows(~3L, x), importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
Type: Regression
Number of trees: 100
Sample size: 195
Number of independent variables: 22
Mtry: 19
Target node size: 3
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 0.7877882
R squared (OOB): 0.9011937
imp <- extract_fit_parsnip(finalmodel) |>
  vip::vip()
imp

final_pred <- terra::predict(predictors, finalmodel)
plot(final_pred)

Still missing: https://stevenpawley.github.io/recipeselectors/. For assessing where the model is applicable, see https://applicable.tidymodels.org/ and https://docs.ropensci.org/waywiser/.
model_aoa <- waywiser::ww_area_of_applicability(
  st_drop_geometry(trainDat[, predictor_names]),
  importance = vip::vi_model(finalmodel)
)
AOA <- terra::predict(predictors, model_aoa)
plot(AOA$di)
plot(AOA$aoa)
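The aoa layer can in turn be used to mask the prediction to the area of applicability; a minimal sketch, assuming cells outside the AOA are encoded as FALSE (0):

# show predictions only where the model is considered applicable
final_pred_masked <- terra::mask(final_pred, AOA$aoa, maskvalues = FALSE)
plot(final_pred_masked)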