library(terra)
library(sf)
library(tidymodels)
library(ranger)
library(dplyr)
library(spatialsample)
library(waywiser)
library(vip)
Read data:
trainingdata <- sf::st_read("https://github.com/LOEK-RS/FOSSGIS2025-examples/raw/refs/heads/main/data/temp_train.gpkg")
predictors <- terra::rast("https://github.com/LOEK-RS/FOSSGIS2025-examples/raw/refs/heads/main/data/predictors.tif")
Prepare data:
trainDat <- sf::st_as_sf(terra::extract(predictors, trainingdata, bind = TRUE))
predictor_names <- names(predictors) # Extract predictor names from the raster
response_name <- "temp"
Compared to caret, there is no need to drop the geometries here.
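For comparison, a caret-based version would have to remove the sf geometry explicitly before training. A minimal, purely illustrative sketch (caret::train() with the ranger engine is assumed here, not taken from this example):

# Illustrative caret equivalent: the sf geometry has to be dropped first
trainDat_df <- sf::st_drop_geometry(trainDat)
caret_fit <- caret::train(
  x = trainDat_df[, predictor_names],
  y = trainDat_df[[response_name]],
  method = "ranger",
  num.trees = 100
)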
First, we train a random forest model. This is done by defining a recipe and a model, and then combining them into a workflow. Such a workflow can then be used to fit the model to the data.
# Define the recipe
formula <- as.formula(paste(
  response_name, "~",
  paste(predictor_names, collapse = " + ")
))
recipe <- recipes::recipe(formula, data = trainDat)

rf_model <- parsnip::rand_forest(trees = 100, mode = "regression") |>
  set_engine("ranger", importance = "impurity")

# Create the workflow
workflow <- workflows::workflow() |>
  workflows::add_recipe(recipe) |>
  workflows::add_model(rf_model)

# Fit the model
rf_fit <- parsnip::fit(workflow, data = trainDat)
Now let’s use the model for spatial prediction with terra::predict().
prediction_raster <- terra::predict(predictors, rf_fit)
plot(prediction_raster)
Cross-validation requires specifying how the data are split into folds. Here, we define a non-spatial cross-validation with rsample::vfold_cv() and a spatial cross-validation with spatialsample::spatial_block_cv().
random_folds <- rsample::vfold_cv(trainDat, v = 4)
block_folds <- spatialsample::spatial_block_cv(trainDat, v = 4, n = 2)
spatialsample::autoplot(block_folds)
# control cross-validation
keep_pred <- tune::control_resamples(save_pred = TRUE, save_workflow = TRUE)
Next, we fit the model to the data using cross-validation with tune::fit_resamples().
### Cross-validation
rf_random <- tune::fit_resamples(
  workflow,
  resamples = random_folds,
  control = keep_pred
)
rf_spatial <- tune::fit_resamples(
  workflow,
  resamples = block_folds,
  control = keep_pred
)
### get CV metrics
# rf_spatial$.metrics # metrics from each fold
tune::collect_metrics(rf_random)
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 0.958 4 0.0990 Preprocessor1_Model1
2 rsq standard 0.898 4 0.0227 Preprocessor1_Model1
tune::collect_metrics(rf_spatial)
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 1.35 4 0.314 Preprocessor1_Model1
2 rsq standard 0.745 4 0.0577 Preprocessor1_Model1
Similar to caret, we first define the folds and a resampling control object. The final model, however, is still stored in a separate object.
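Because keep_pred was created with save_pred = TRUE, the held-out predictions of each fold are stored as well and can be inspected with tune::collect_predictions(). A small sketch, not part of the original workflow:

# Held-out predictions from the spatial cross-validation
cv_preds <- tune::collect_predictions(rf_spatial)
head(cv_preds)

# Observed vs. predicted temperature per fold (ggplot2 is attached via tidymodels)
ggplot(cv_preds, aes(x = temp, y = .pred)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~id)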
Next, we tune the model hyperparameters. For this, we change the workflow to include the tuning specifications by using the tune() function inside the model definition, and we define a grid of hyperparameters to search over. The tuning is done with tune::tune_grid().
# mark two parameters for tuning:
rf_model <- parsnip::rand_forest(
  trees = 100,
  mode = "regression",
  mtry = tune(),
  min_n = tune()
) |>
  set_engine("ranger", importance = "impurity")

workflow <- update_model(workflow, rf_model)
# define tune grid:
grid_rf <- grid_space_filling(
  mtry(range = c(1, 20)),
  min_n(range = c(2, 10)),
  size = 30
)
# tune:
rf_tuning <- tune_grid(
  workflow,
  resamples = block_folds,
  grid = grid_rf,
  control = keep_pred
)
The results can be extracted with collect_metrics()
and then visualized.
rf_tuning |>
  collect_metrics()
# A tibble: 60 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 5 rmse standard 1.88 4 0.278 Preprocessor1_Model01
2 1 5 rsq standard 0.610 4 0.100 Preprocessor1_Model01
3 1 9 rmse standard 1.96 4 0.322 Preprocessor1_Model02
4 1 9 rsq standard 0.603 4 0.125 Preprocessor1_Model02
5 2 4 rmse standard 1.71 4 0.328 Preprocessor1_Model03
6 2 4 rsq standard 0.639 4 0.0833 Preprocessor1_Model03
7 2 2 rmse standard 1.59 4 0.263 Preprocessor1_Model04
8 2 2 rsq standard 0.679 4 0.0917 Preprocessor1_Model04
9 3 7 rmse standard 1.53 4 0.311 Preprocessor1_Model05
10 3 7 rsq standard 0.712 4 0.0693 Preprocessor1_Model05
# ℹ 50 more rows
rf_tuning |>
  collect_metrics() |>
  mutate(min_n = factor(min_n)) |>
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(linewidth = 1.5, alpha = 0.6) +
  geom_point(size = 2) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10(labels = scales::label_number()) +
  scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
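Before refitting, the best hyperparameter combinations can also be listed directly with tune::show_best() (a short sketch added here for illustration):

# Best hyperparameter combinations according to RMSE
tune::show_best(rf_tuning, metric = "rmse", n = 5)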
Finally, we can extract the best model and use it to get the variable importance and make predictions.
finalmodel <- fit_best(rf_tuning)
finalmodel
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────
Ranger result
Call:
ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~19L, x), num.trees = ~100, min.node.size = min_rows(~3L, x), importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
Type: Regression
Number of trees: 100
Sample size: 195
Number of independent variables: 22
Mtry: 19
Target node size: 3
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 0.7877882
R squared (OOB): 0.9011937
imp <- extract_fit_parsnip(finalmodel) |>
  vip::vip()
imp
final_pred <- terra::predict(predictors, finalmodel)
plot(final_pred)
Still missing: https://stevenpawley.github.io/recipeselectors/, https://applicable.tidymodels.org/, https://docs.ropensci.org/waywiser/
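As a teaser for the applicable package, a PCA-based applicability-domain check might look roughly like this (a hedged sketch; the apd_pca()/score() usage and the as.data.frame() conversion are assumptions, not taken from this example):

# Hypothetical sketch: PCA-based applicability domain with the applicable package
library(applicable)
pca_ad <- applicable::apd_pca(sf::st_drop_geometry(trainDat[, predictor_names]))
# Score the prediction locations (raster cells as a data frame)
newdata <- terra::as.data.frame(predictors, na.rm = TRUE)
pca_scores <- score(pca_ad, newdata)
summary(pca_scores$distance_pctl)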
model_aoa <- waywiser::ww_area_of_applicability(
  st_drop_geometry(trainDat[, predictor_names]),
  importance = vip::vi_model(finalmodel)
)
AOA <- terra::predict(predictors, model_aoa)
plot(AOA$di)
plot(AOA$aoa)
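The AOA can then be used to mask the prediction map so that only locations inside the area of applicability are shown. A minimal sketch, assuming the aoa layer encodes applicability as TRUE/FALSE:

# Set predictions outside the area of applicability to NA
final_pred_masked <- terra::mask(final_pred, AOA$aoa, maskvalues = FALSE)
plot(final_pred_masked)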