% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/visualisation.R
\name{plot_counterfactual}
\alias{plot_counterfactual}
\title{Prepare Plot Data and Plot Counterfactuals}
\usage{
plot_counterfactual(
  predictions,
  params,
  window_size = 14,
  date_effect_start = NULL,
  buffer = 0,
  plot_pred_interval = TRUE
)
}
\arguments{
\item{predictions}{The data.table containing the (hourly) predictions.}
\item{params}{Parameters for plotting, including the target variable.}
\item{window_size}{The window size for the rolling mean (default is 14 days).}
\item{date_effect_start}{A date. Start date of the effect that is to be
evaluated. Data from this point onwards is disregarded when calculating
model performance.}
\item{buffer}{Integer. An additional, optional buffer window before
\code{date_effect_start} to account for uncertainty in the effect start point.
Disregards additional buffer data points for model evaluation.
Use \code{buffer=0} for no buffer.}
\item{plot_pred_interval}{Boolean. If \code{TRUE}, shows a grey band of the prediction
interval.}
}
\value{
A ggplot object with the counterfactual plot. Can be adjusted further,
e.g. by setting y-axis limits for better visualisation.
}
\description{
Smooths the predictions using a rolling mean, prepares the data for plotting,
and generates the counterfactual plot for the application window. Data before
the red box belongs to the reference window, the red box marks the buffer, and
values after the black dotted line belong to the effect window.
}
\details{
The optional grey ribbon is a prediction interval for the hourly values. The
interpretation for a 90\% prediction interval (set via the \code{alpha} parameter
of \code{\link[=run_counterfactual]{run_counterfactual()}}) is that 90\% of the true hourly values
(not the rolled means) lie within the grey band. This might be helpful for
getting an idea of the variance of the data and predictions.
}
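% Illustrative example (not run): assumes `res` was produced by run_counterfactual()
% and `params` was loaded via load_params(); the effect start date and buffer are placeholders.
\examples{
\dontrun{
res <- run_counterfactual(split_data, params, detrending_function = "linear")
plot_counterfactual(res$retrended_predictions, params,
  window_size = 14,
  date_effect_start = lubridate::ymd("20200315"),
  buffer = 24
)
}
}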
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/visualisation.R
\name{plot_station_measurements}
\alias{plot_station_measurements}
\title{Descriptive plot of daily time series data}
\usage{
plot_station_measurements(
  env_data,
  variables,
  years = NULL,
  smoothing_factor = 1
)
}
\arguments{
\item{env_data}{A data table of measurements of one air quality measurement station.
The data should contain the following columns:
\describe{
\item{Station}{Station identifier where the data was collected.}
\item{Komponente}{The environmental component being measured
(e.g., temperature, NO2).}
\item{Wert}{The measured value of the component.}
\item{date}{The timestamp for the observation,
formatted as a Date-Time object in the format
\code{"YYYY-MM-DD HH:MM:SS"} (e.g., "2010-01-01 07:00:00").}
\item{Komponente_txt}{A textual description or label for the component.}
}}
\item{variables}{List of variables to plot. Must be in \code{env_data$Komponente}.
Meteorological variables can be obtained from params.yaml.}
\item{years}{Optional. A numeric vector, list, or a range specifying the
years to restrict the plotted data.
You can provide:
\itemize{
\item A single year: \code{years = 2020}
\item A numeric vector of years: \code{years = c(2019, 2020, 2021)}
\item A range of years: \code{years = 2019:2021}
}
If not provided, data for all available years will be used.}
\item{smoothing_factor}{A number that defines the magnitude of smoothing.
Default is 1. Smaller numbers correspond to less smoothing, larger numbers to more.}
}
\description{
This function produces descriptive time-series plots with smoothing
for the meteorological and potential target variables that were measured at a station.
}
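% Illustrative example (not run): uses the package's sample dataset; the variable
% names and year range are placeholders assumed to be present in sample_data_DESN025.
\examples{
\dontrun{
plot_station_measurements(sample_data_DESN025,
  variables = c("NO2", "TMP"),
  years = 2019:2020
)
}
}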
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/modelling.R
\name{prepare_data_for_modelling}
\alias{prepare_data_for_modelling}
\title{Prepare Data for Training a model}
\usage{
prepare_data_for_modelling(env_data, params)
}
\arguments{
\item{env_data}{A data table in long format.
Must include the following columns:
\describe{
\item{Station}{Station identifier for the data.}
\item{Komponente}{The environmental component being measured
(e.g., temperature, NO2).}
\item{Wert}{The measured value of the component.}
\item{date}{Timestamp as \code{POSIXct} object in \verb{YYYY-MM-DD HH:MM:SS} format.}
\item{Komponente_txt}{A textual description of the component.}
}}
\item{params}{A list of modelling parameters loaded from \code{params.yaml}.
Must include:
\describe{
\item{meteo_variables}{A vector of meteorological variable names.}
\item{target}{The name of the target variable.}
}}
}
\value{
A \code{data.table} in wide format, with columns:
\code{date}, one column per component, and temporal features
like \code{date_unix}, \code{day_julian}, \code{weekday}, and \code{hour}.
}
\description{
Prepares environmental data by filtering for relevant components,
converting the data to a wide format, and adding temporal features. Should be
called before
\code{\link[ubair:split_data_counterfactual]{split_data_counterfactual()}}.
}
\examples{
env_data <- data.table::data.table(
  Station = c("StationA", "StationA", "StationA"),
  Komponente = c("NO2", "TMP", "NO2"),
  Wert = c(50, 20, 40),
  date = as.POSIXct(c("2023-01-01 10:00:00", "2023-01-01 11:00:00", "2023-01-02 12:00:00"))
)
params <- list(meteo_variables = c("TMP"), target = "NO2")
prepared_data <- prepare_data_for_modelling(env_data, params)
print(prepared_data)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_preprocessing.R
\name{rescale_predictions}
\alias{rescale_predictions}
\title{Rescale predictions to original scale.}
\usage{
rescale_predictions(scale_result, dt_predictions)
}
\arguments{
\item{scale_result}{A list object returned by \code{\link[=scale_data]{scale_data()}},
containing the means and standard deviations used for scaling.}
\item{dt_predictions}{A data frame containing the predictions,
including columns \code{prediction}, \code{prediction_lower}, \code{prediction_upper}.}
}
\value{
A data frame with the predictions and numeric columns rescaled back
to their original scale.
}
\description{
This function rescales the predicted values (\code{prediction}, \code{prediction_lower},
\code{prediction_upper}). The scaling is reversed using the means and
standard deviations that were saved from the training data. It is the inverse
function of \code{\link[=scale_data]{scale_data()}} and should only be used in combination with it.
}
\examples{
\dontrun{
scale_res <- scale_data(train_data = train, apply_data = apply)
res <- run_fnn(train = scale_res$train, test = scale_res$apply, params)
dt_predictions <- res$dt_predictions
rescaled_predictions <- rescale_predictions(scale_res, dt_predictions)
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_preprocessing.R
\name{retrend_predictions}
\alias{retrend_predictions}
\title{Restores the trend in the prediction}
\usage{
retrend_predictions(dt_predictions, trend, log_transform = FALSE)
}
\arguments{
\item{dt_predictions}{Dataframe of predictions with columns \code{value},
\code{prediction}, \code{prediction_lower}, \code{prediction_upper}.}
\item{trend}{lm object generated by \code{\link[=detrend]{detrend()}}.}
\item{log_transform}{If \code{TRUE}, predictions are transformed back from log
space to the original scale. Use only in combination with the \code{log_transform}
parameter of the detrend function.}
}
\value{
Retrended dataframe with the same structure as \code{dt_predictions},
which is returned by any of the \code{run_model()} functions.
}
\description{
Takes a dataframe of predictions as returned by any of
the 'run_model' functions and restores a trend which was previously
removed via \code{\link[=detrend]{detrend()}}. This is necessary for the predictions
and the true values to have the same units. The function is the inverse of
\code{\link[=detrend]{detrend()}} and should only be used in combination with it.
}
\examples{
\dontrun{
detrended_list <- detrend(split_data,
  mode = detrending_function,
  log_transform = log_transform
)
trend <- detrended_list$model
detrended_train <- detrended_list$train
detrended_apply <- detrended_list$apply
detrended_train <- detrended_train \%>\% select(value, dplyr::any_of(variables))
result <- run_lightgbm(
  train = detrended_train,
  test = detrended_apply,
  model_params = params$lightgbm,
  alpha = 0.9,
  calc_shaps = FALSE
)
retrended_predictions <- retrend_predictions(result$dt_predictions, trend)
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/counterfactual_model.R
\name{run_counterfactual}
\alias{run_counterfactual}
\title{Full counterfactual simulation run}
\usage{
run_counterfactual(
  split_data,
  params,
  detrending_function = "none",
  model_type = "rf",
  alpha = 0.9,
  log_transform = FALSE,
  calc_shaps = FALSE
)
}
\arguments{
\item{split_data}{List of two named dataframes, \code{train} and \code{apply}.}
\item{params}{A list of parameters that define the following:
\describe{
\item{meteo_variables}{A character vector specifying the names of the
meteorological variables used as inputs.}
\item{model}{A list of hyperparameters for training the chosen model. The name of
this list and its parameters depend on the chosen model. See the \code{\link[=run_dynamic_regression]{run_dynamic_regression()}},
\code{\link[=run_lightgbm]{run_lightgbm()}}, \code{\link[=run_rf]{run_rf()}} and \code{\link[=run_fnn]{run_fnn()}} functions for details.}
}}
\item{detrending_function}{String which defines type of trend to remove.
Options are "linear", "quadratic", "exponential", "spline", "none". See \code{\link[=detrend]{detrend()}}
and \code{\link[=retrend_predictions]{retrend_predictions()}} for details.}
\item{model_type}{String to decide which model to use. Current options are random
forest ("rf"), gradient boosted decision trees ("lightgbm"), dynamic regression
("dynamic_regression") and feedforward neural network ("fnn").}
\item{alpha}{Confidence level of the prediction interval between 0 and 1.}
\item{log_transform}{If TRUE, uses log transformation during detrending and
retrending. For details see the \code{\link[=detrend]{detrend()}} documentation.}
\item{calc_shaps}{Boolean value. If TRUE, calculate SHAP values for the
method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz::sv_importance()}} and
\code{\link[shapviz:sv_dependence]{shapviz::sv_dependence()}}.
The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
test data.}
}
\value{
List with the data frame of retrended predictions and the trained model.
}
\description{
Chains detrending, training of a selected model, prediction and retrending together
for ease of use. See documentation of individual functions for details.
}
\examples{
\dontrun{
split_data <- split_data_counterfactual(
  dt_prepared,
  application_start = application_start,
  application_end = application_end
)
res <- run_counterfactual(split_data, params, detrending_function = "linear")
prediction <- res$retrended_predictions
random_forest_model <- res$model
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/counterfactual_model.R
\name{run_dynamic_regression}
\alias{run_dynamic_regression}
\title{Run the dynamic regression model}
\usage{
run_dynamic_regression(train, test, params, alpha, calc_shaps)
}
\arguments{
\item{train}{Dataframe of train data as returned by the \code{\link[=split_data_counterfactual]{split_data_counterfactual()}}
function.}
\item{test}{Dataframe of test data as returned by the \code{\link[=split_data_counterfactual]{split_data_counterfactual()}}
function.}
\item{params}{List of hyperparameters to use in the dynamic regression call. Only
\code{ntrain} is used; it specifies the number of data points used for training.
The default of 8760 corresponds to one year of hourly data.}
\item{alpha}{Confidence level of the prediction interval between 0 and 1.}
\item{calc_shaps}{Boolean value. If TRUE, calculate SHAP values for the
method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz::sv_importance()}} and
\code{\link[shapviz:sv_dependence]{shapviz::sv_dependence()}}.
The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
test data.}
}
\value{
List with data frame of predictions and model
}
\description{
This function trains a dynamic regression model with fourier transformed temporal features
and meteorological variables as external regressors on the
specified training dataset and makes predictions on the test dataset in a
counterfactual scenario. This is referred to as a dynamic regression model in
\href{https://otexts.com/fpp3/dynamic.html}{Forecasting: Principles and Practice, Chapter 10 - Dynamic regression models}.
}
\details{
Note: Use this function directly only if you have your own data pipeline;
otherwise call it via \code{\link[=run_counterfactual]{run_counterfactual()}}.
}
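% Illustrative sketch (not run): assumes split_data comes from split_data_counterfactual()
% and params$dynamic_regression is the hyperparameter list from params.yaml (assumed name);
% the returned list is assumed to contain dt_predictions as for the other run_* functions.
\examples{
\dontrun{
res <- run_dynamic_regression(
  train = split_data$train,
  test = split_data$apply,
  params = params$dynamic_regression,
  alpha = 0.9,
  calc_shaps = FALSE
)
predictions <- res$dt_predictions
}
}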
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/counterfactual_model.R
\name{run_fnn}
\alias{run_fnn}
\title{Train a Feedforward Neural Network (FNN) in a Counterfactual Scenario.}
\usage{
run_fnn(train, test, params, calc_shaps)
}
\arguments{
\item{train}{A data frame or tibble containing the training dataset,
including the target variable (\code{value})
and meteorological variables specified in \code{params$meteo_variables}.}
\item{test}{A data frame or tibble containing the test dataset on which
predictions will be made,
using the same meteorological variables as in the training dataset.}
\item{params}{A list of parameters that define the following:
\describe{
\item{meteo_variables}{A character vector specifying the names of the
meteorological variables used as inputs.}
\item{fnn}{A list of hyperparameters for training the feedforward neural
network, including:
\itemize{
\item \code{activation_fun}: The activation function for the hidden
layers (e.g., "sigmoid", "tanh").
\item \code{momentum}: The momentum factor for training.
\item \code{learningrate_scale}: Factor for adjusting learning rate.
\item \code{output_fun}: The activation function for the output layer.
\item \code{batchsize}: The size of the batches during training.
\item \code{hidden_dropout}: Dropout rate for the hidden layers to
prevent overfitting.
\item \code{visible_dropout}: Dropout rate for the input layer.
\item \code{hidden_layers}: A vector specifying the number of neurons
in each hidden layer.
\item \code{num_epochs}: Number of epochs (iterations) for training.
\item \code{learning_rate}: Initial learning rate.
}
}
}}
\item{calc_shaps}{Boolean value. If TRUE, calculate SHAP values for the
method used and format them so they can be visualised with
\code{\link[shapviz:sv_importance]{shapviz::sv_importance()}} and
\code{\link[shapviz:sv_dependence]{shapviz::sv_dependence()}}.
The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
test data.}
}
\value{
A list with three elements:
\describe{
\item{\code{dt_predictions}}{A data frame containing the test data along
with the predicted values:
\describe{
\item{\code{prediction}}{The predicted values from the FNN model.}
\item{\code{prediction_lower}}{The same predicted values, as no
quantile model is available yet for FNN.}
\item{\code{prediction_upper}}{The same predicted values, as no
quantile model is available yet for FNN.}
}
}
\item{\code{model}}{The trained FNN model object from the
\code{deepnet::nn.train()} function.}
\item{\code{importance}}{SHAP importance values (if
\code{calc_shaps = TRUE}). Otherwise, \code{NULL}.}
}
}
\description{
Trains a feedforward neural network (FNN) model on the
specified training dataset and makes predictions on the test dataset in a
counterfactual scenario. The model uses meteorological variables and
sin/cosine-transformed features. Scales the data before training and rescales
predictions, as the model does not converge with unscaled data.
}
\details{
This function provides flexibility for users with their own data pipelines
or workflows. For a simplified pipeline, consider using
\code{\link[ubair:run_counterfactual]{run_counterfactual()}}.
Experiment with hyperparameters such as \code{learning_rate},
\code{batchsize}, \code{hidden_layers}, and \code{num_epochs} to improve
performance.
Warning: Using many or large hidden layers in combination with a high number
of epochs can lead to long training times.
}
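% Illustrative sketch (not run), mirroring the rescale_predictions() example:
% detrended_list and params are assumed to come from the standard ubair pipeline
% (detrend() and load_params()).
\examples{
\dontrun{
scale_res <- scale_data(
  train_data = detrended_list$train,
  apply_data = detrended_list$apply
)
res <- run_fnn(
  train = scale_res$train,
  test = scale_res$apply,
  params = params,
  calc_shaps = FALSE
)
rescaled_predictions <- rescale_predictions(scale_res, res$dt_predictions)
}
}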
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/counterfactual_model.R
\name{run_lightgbm}
\alias{run_lightgbm}
\title{Run gradient boosting model with lightgbm}
\usage{
run_lightgbm(train, test, model_params, alpha, calc_shaps)
}
\arguments{
\item{train}{Dataframe of train data as returned by the \code{\link[=split_data_counterfactual]{split_data_counterfactual()}}
function.}
\item{test}{Dataframe of test data as returned by the \code{\link[=split_data_counterfactual]{split_data_counterfactual()}}
function.}
\item{model_params}{List of hyperparameters to use in the lgb.train call.
See the params argument of \code{\link[lightgbm:lgb.train]{lightgbm::lgb.train()}} for details.}
\item{alpha}{Confidence level of the prediction interval between 0 and 1.}
\item{calc_shaps}{Boolean value. If TRUE, calculate SHAP values for the
method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz::sv_importance()}} and
\code{\link[shapviz:sv_dependence]{shapviz::sv_dependence()}}.
The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
test data.}
}
\value{
List with data frame of predictions and model
}
\description{
This function trains a gradient boosting model (lightgbm) on the
specified training dataset and makes predictions on the test dataset in a
counterfactual scenario. The model uses meteorological variables and temporal features.
}
\details{
Note: Use this function directly only if you have your own data pipeline;
otherwise call it via \code{\link[=run_counterfactual]{run_counterfactual()}}.
}
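% Illustrative sketch (not run), following the retrend_predictions() example:
% split_data is assumed to come from split_data_counterfactual() and
% params$lightgbm from params.yaml.
\examples{
\dontrun{
result <- run_lightgbm(
  train = split_data$train,
  test = split_data$apply,
  model_params = params$lightgbm,
  alpha = 0.9,
  calc_shaps = FALSE
)
predictions <- result$dt_predictions
}
}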
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/counterfactual_model.R
\name{run_rf}
\alias{run_rf}
\title{Run random forest model with ranger}
\usage{
run_rf(train, test, model_params, alpha, calc_shaps)
}
\arguments{
\item{train}{Dataframe of train data as returned by the \code{\link[=split_data_counterfactual]{split_data_counterfactual()}}
function.}
\item{test}{Dataframe of test data as returned by the \code{\link[=split_data_counterfactual]{split_data_counterfactual()}}
function.}
\item{model_params}{List of hyperparameters to use in the ranger call. See \code{\link[ranger:ranger]{ranger::ranger()}} for options.}
\item{alpha}{Confidence level of the prediction interval between 0 and 1.}
\item{calc_shaps}{Boolean value. If TRUE, calculate SHAP values for the
method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz::sv_importance()}} and
\code{\link[shapviz:sv_dependence]{shapviz::sv_dependence()}}.
The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
test data.}
}
\value{
List with data frame of predictions and model
}
\description{
This function trains a random forest model (ranger) on the
specified training dataset and makes predictions on the test dataset in a
counterfactual scenario. The model uses meteorological variables and temporal features.
}
\details{
Note: Use this function directly only if you have your own data pipeline;
otherwise call it via \code{\link[=run_counterfactual]{run_counterfactual()}}.
}
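% Illustrative sketch (not run): analogous to run_lightgbm(); split_data is assumed
% to come from split_data_counterfactual(), and params$rf is an assumed name for the
% ranger hyperparameter list in params.yaml.
\examples{
\dontrun{
result <- run_rf(
  train = split_data$train,
  test = split_data$apply,
  model_params = params$rf,
  alpha = 0.9,
  calc_shaps = FALSE
)
predictions <- result$dt_predictions
}
}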
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sample_data_DESN025.R
\docType{data}
\name{sample_data_DESN025}
\alias{sample_data_DESN025}
\title{Environmental Data for Modelling from station DESN025 in Leipzig-Mitte.}
\format{
\subsection{sample_data_DESN025}{
A data table with the following columns:
\describe{
\item{Station}{Station identifier where the data was collected.}
\item{Komponente}{The environmental component being measured
(e.g., temperature, NO2).}
\item{Wert}{The measured value of the component.}
\item{date}{The timestamp for the observation, formatted as a Date-Time
object in the format
\code{"YYYY-MM-DD HH:MM:SS"} (e.g., "2010-01-01 07:00:00").}
\item{Komponente_txt}{A textual description or label for the component.}
}
The dataset is structured in a long format and is prepared for further
transformation into a wide format for modelling.
}
}
\source{
Umweltbundesamt
}
\usage{
sample_data_DESN025
}
\description{
A dataset containing environmental measurements collected at the measuring
station DESN025 in Leipzig-Mitte, with observations of different environmental
components over time. It is used for environmental modelling tasks and includes
meteorological variables as well as potential target variables.
}
\examples{
\dontrun{
params <- load_params("path/to/params.yaml")
dt_prepared <- prepare_data_for_modelling(sample_data_DESN025, params)
}
}
\keyword{datasets}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_preprocessing.R
\name{scale_data}
\alias{scale_data}
\title{Standardize Training and Application Data}
\usage{
scale_data(train_data, apply_data)
}
\arguments{
\item{train_data}{A data frame containing the training dataset to be
standardized. It must contain numeric columns.}
\item{apply_data}{A data frame containing the dataset to which the scaling
from \code{train_data} will be applied.}
}
\value{
A list containing the following elements:
\item{train}{The standardized training data.}
\item{apply}{The \code{apply_data} scaled using the means and standard deviations
from the \code{train_data}.}
\item{means}{The means of the numeric columns in \code{train_data}.}
\item{sds}{The standard deviations of the numeric columns in \code{train_data}.}
}
\description{
This function standardizes numeric columns of the \code{train_data} and applies
the same scaling (mean and standard deviation) to the corresponding columns
in \code{apply_data}. It returns the standardized data along with the scaling
parameters (means and standard deviations). This is particularly important
for neural network approaches, as they tend to be numerically unstable and
their performance deteriorates on unscaled data.
}
\examples{
\dontrun{
scale_result <- scale_data(
  train_data = detrended_list$train,
  apply_data = detrended_list$apply
)
scaled_train <- scale_result$train
scaled_apply <- scale_result$apply
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/modelling.R
\name{split_data_counterfactual}
\alias{split_data_counterfactual}
\title{Split Data into Training and Application Datasets}
\usage{
split_data_counterfactual(dt_prepared, application_start, application_end)
}
\arguments{
\item{dt_prepared}{The prepared data table.}
\item{application_start}{The start date (Date object) for the application
period of the business-as-usual simulation. This coincides with the start of
the reference window.
Can be created e.g. by \code{lubridate::ymd("20191201")}.}
\item{application_end}{The end date (Date object) for the application period
of the business-as-usual simulation. This coincides with the end of
the effect window.
Can be created e.g. by \code{lubridate::ymd("20191201")}.}
}
\value{
A list with two elements:
\describe{
\item{train}{Data outside the application period.}
\item{apply}{Data within the application period.}
}
}
\description{
Splits prepared data into training and application datasets based on
specified date ranges for a business-as-usual scenario. Data before
\code{application_start} and after \code{application_end} is used as training data,
while data within the date range is used for application.
}
\examples{
dt_prepared <- data.table::data.table(
  date = as.Date(c("2023-01-01", "2023-01-05", "2023-01-10")),
  value = c(50, 60, 70)
)
result <- split_data_counterfactual(
  dt_prepared,
  application_start = as.Date("2023-01-03"),
  application_end = as.Date("2023-01-08")
)
print(result$train)
print(result$apply)
}
publiccodeYmlVersion: '1.0'
name: 'ubair'
url: 'https://gitlab.opencode.de/uba-ki-lab/ubair'
releaseDate: '2025-01-15'
softwareVersion: '1.1.0'
platforms:
- linux
- windows
categories:
- data-visualization
- data-analytics
- data-collection
- predictive-analysis
developmentStatus: stable
softwareType: 'library'
description:
  de:
    genericName: 'ubair'
    shortDescription: 'R-Paket zur Untersuchung der Auswirkungen externer Bedingungen auf die Luftqualität.'
    longDescription: >-
      ubair ist ein Paket für die Statistiksoftware R, um die Auswirkungen externer Faktoren, wie Verkehrsbeschränkungen,
      Umweltunfälle und politische Maßnahmen, auf die Luftqualität zu analysieren und zu visualisieren.
      Ziel ist es, Experten einen transparenten Vergleich von verschiedenen Modellierungsansätzen zu ermöglichen
      und datengestützte Auswertungen für die Politikberatung zu unterstützen.
    features:
      - Visualisierung von Stationsmessungen
      - Business-as-Usual-Simulation für Luftqualität
      - Schätzung der Effektgröße für einen Effektzeitraum
      - Vergleich von Ergebnissen von statistischen, Machine-Learning-Modellen und Neuronalen Netzen
  en:
    genericName: 'ubair'
    shortDescription: 'R package for investigating the impact of external conditions on air quality'
    longDescription: >-
      ubair is an R package for the statistical investigation of the impact of external conditions on air quality:
      it uses the statistical software R to analyze and visualize the impact of external factors, such as traffic
      restrictions, hazards, and political measures, on air quality. It aims to provide experts with a transparent
      comparison of modeling approaches and to support data-driven evaluations for policy advisory purposes.
    features:
      - Visualization of station measurements
      - Business-as-usual simulation of air quality
      - Estimation of effect size for an effect window
      - Comparison of statistical, machine learning and deep learning model results
legal:
  license: 'GPL-3'
maintenance:
  type: internal
  contacts:
    - name: 'UBA KI-Lab'
      email: 'ki-anwendungslabor@uba.de'
      affiliation: Umweltbundesamt
      website: 'https://www.umweltbundesamt.de/themen/digitalisierung/anwendungslabor-fuer-kuenstliche-intelligenz-big'
logo: ki-lab-logo.png
localisation:
  localisationReady: yes
  availableLanguages:
    - de
    - en
library/
local/
cellar/
lock/
python/
sandbox/
staging/
{
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "ppm.enabled": null,
  "ppm.ignored.urls": [],
  "r.version": null,
  "snapshot.type": "implicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
}
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
# * https://testthat.r-lib.org/articles/special-files.html
library(testthat)
library(ubair)
test_check("ubair")
# In tests/testthat/test-clean_data.R
test_that("clean_data works correctly with daily aggregation", {
# Create a mock data.table
env_data <- data.table::data.table(
date = as.POSIXct(c(
"2021-01-01 00:00:00", "2021-01-01 01:00:00",
"2021-01-01 02:00:00"
)),
Station = "TEST001",
part = "test",
Komponente = "TMP",
Komponente_txt = "Temperature",
Wert = c(10, 20, 30)
)
# Run clean_data function with daily aggregation
cleaned_data <- clean_data(env_data, "TEST001",
aggregate_daily = TRUE
)
# Check if the cleaned data has the expected structure and content
expect_s3_class(cleaned_data, "data.table")
expect_true(all(c(
"Station", "Komponente", "Komponente_txt", "day",
"year", "Wert"
)
%in% colnames(cleaned_data)))
expect_equal(nrow(cleaned_data), 1) # Aggregated to one row
expect_equal(cleaned_data$Wert, 20) # Mean of 10, 20, 30
expect_equal(cleaned_data$day[1], as.POSIXct("2021-01-01"))
expect_equal(cleaned_data$year[1], 2021)
})
test_that("clean_data works correctly without daily aggregation", {
# Create a mock data.table
env_data <- data.table::data.table(
date = as.POSIXct(c(
"2021-01-01 00:00:00", "2021-01-01 01:00:00",
"2021-01-01 02:00:00"
)),
Station = "TEST001",
part = "test",
Komponente = "TMP",
Komponente_txt = "Temperature",
Wert = c(10, 20, 30)
)
# Run clean_data function without daily aggregation
cleaned_data <- clean_data(env_data, "TEST001",
aggregate_daily = FALSE
)
# Check if the cleaned data has the expected structure and content
expect_s3_class(cleaned_data, "data.table")
expect_true(all(c(
"date", "Station", "Komponente", "Komponente_txt",
"Wert", "year"
) %in% colnames(cleaned_data)))
expect_equal(nrow(cleaned_data), 3) # No aggregation
})
test_that("clean_data filters by station correctly", {
env_data <- data.table::data.table(
date = as.POSIXct(c("2021-01-01 00:00:00", "2021-01-01 01:00:00")),
Station = c("TEST001", "TEST002"),
part = "test",
Komponente = "TMP",
Komponente_txt = "Temperature",
Wert = c(10, 20)
)
cleaned_data <- clean_data(env_data, "TEST001",
aggregate_daily = FALSE
)
expect_equal(unique(cleaned_data$Station), "TEST001")
})
test_that("clean_data handles empty data.table gracefully", {
env_data <- data.table::data.table(
date = as.POSIXct(character()),
Station = character(),
part = character(),
Komponente = character(),
Komponente_txt = character(),
Wert = numeric()
)
cleaned_data <- clean_data(env_data, "TEST001",
aggregate_daily = TRUE
)
expect_equal(nrow(cleaned_data), 0)
})
