data.list.R

# Do this in a separate tmp.R file to check the documentation
# library(devtools)
# document()
# load_all(as.package("../../onlineforecast"))
# ?as.data.list
# ?data.list
#?as.data.list.data.frame


#' Make a data.list of the vectors and data.frames given.
#'
#' See the vignette 'setup-data' on how a data.list must be setup.
#' 
#' It's simply a list of class \code{data.list} holding:
#' 
#'   - vector \code{t}
#' 
#'   - vector(s) of observations
#' 
#'   - data.frames (or matrices) of forecast inputs
#' 
#' 
#' @title Make a data.list
#' @param ... Should hold: time t, observations as vectors and forecasts as data.frames
#' @return a data.list.
#' @examples
#' # Put together a data.list
#' # The time vector
#' time <- seq(ct("2019-01-01"),ct("2019-01-02"),by=3600)
#' # Observations time series (as vector)
#' xobs <- rnorm(length(time))
#' # Forecast input as a data.frame with columns names 'kxx', where 'xx' is the horizon
#' X <- data.frame(matrix(rnorm(length(time)*3), ncol=3))
#' names(X) <- pst("k",1:3)
#' 
#' D <- data.list(t=time, xobs=xobs, X=X)
#'
#' # Check it (see \code{?\link{summary.data.list}})
#' summary(D)
#' 
#' @export
data.list <- function(...) {
    structure(list(...), class = c("data.list","list"))
}


#' Take a subset of a data.list.
#'
#' Different arguments can be given to select the subset. See the examples.
#' 
#' @title Take a subset of a data.list.
#' @param x The data.list to take a subset of.
#' @param subset Given as the integer indexes or a logical vector, or alternatively \code{c(tstart,tend)}, where tstart and tend are either as POSIX or characters.
#' @param nms The names of the variables in \code{x} to be included.
#' @param kseq The k horizons of forecasts to be included.
#' @param lagforecasts Should the forecasts be lagged k steps (thus useful for plotting etc.).
#' @param pattern Regex pattern applied to select the variables in x to be included.
#' @param ... Not implemented.
#' @return a data.list with the subset.
#' @examples
#' # Use the data.list with building heat load 
#' D <- Dbuilding
#' # Take a subset for the example
#' D <- subset(D, 1:10, nms=c("t","Taobs","Ta","Iobs","I"), kseq=1:3)
#' 
#' # Take subset index 2:4
#' subset(D, 2:4)
#' 
#' # Take subset for a period
#' subset(D, c("2010-12-15 02:00","2010-12-15 04:00"))
#' 
#' # Cannot request a variable not there
#' try(subset(D, nms=c("x","Ta")))
#' 
#' # Take specific horizons
#' subset(D, nms=c("I","Ta"), kseq = 1:2)
#' subset(D, nms=c("I","Ta"), kseq = 1)
#' 
#' # Lag the forecasts such that they are aligned in time with observations
#' subset(D, nms=c("Taobs","Ta"), kseq = 2:3, lagforecasts = TRUE)
#' 
#' # The order follows the order in nms
#' subset(D, nms=c("Ta","I"), kseq = 2)
#' 
#' # Return variables mathing a regex
#' subset(D, kseq=2, pattern="^I")
#' 
#' # Take data for Ta and lag the forecasts (good for plotting and fitting a model)
#' X <- subset(Dbuilding, 1:1000, pattern="^Ta", kseq = 10, lagforecasts = TRUE)
#' 
#' # A scatter plot between the forecast and the observations
#' # (try lagforecasts = FALSE and see the difference)
#' plot(X$Ta$k10, X$Taobs)
#'
#' # Fit a model for the 10-step horizon
#' abline(lm(Taobs ~ Ta.k10, as.data.frame(X)), col=2)
#'
#' @export
subset.data.list <- function(x, subset = NA, nms = NA, kseq = NA, lagforecasts = FALSE, pattern = NA, ...) {
    D <- x
    # --------------------------------
    # Set nms if needed (find the columns to take)
    if(is.na(nms[1])){
        nms <- names(D)
    }
    # If a pattern is given then find the columns
    if(!is.na(pattern[1])){
        # If the pattern has an or "|", then split on it to get the right order of the names
        nms <- unlist(sapply(strsplit(pattern, "\\|")[[1]], function(pat){
            grep(pat, names(D), value=TRUE)
        }))
    }
    # --------------------------------
    # Input checks
    # Check if all variables are in nms
    if(!all(nms %in% names(D))){ stop(pst("The variable ",nms[nms %in% names(D)]," is not in D"))}
    #
    if(!is.na(kseq)[1]){
        lapply(1:length(nms), function(i){
            X <- D[[nms[i]]]
            if(class(X)[1] == "data.frame" ){
                # Check if holds forecasts by checking if any name is "kxx"
                if(length(grep("k[[:digit:]]+$", names(X))) > 0){
                    # If it holds forecasts, check that they are all there
                    if( !all(pst("k",kseq) %in% names(X)) ){
                        warning(pst("The variable ",nms[i]," contains ",pst(names(X),collapse=",")," hence doesn't contain all k in kseq = ",pst(kseq,collapse=",")))
                    }
                }
            }
        })
    }
    # --------------------------------
    # If subset is NA then set it
    if(is.na(subset[1])){
        if(is.null(dim(D[[1]]))){
            subset <- 1:length(D[[1]])
        }else{
            subset <- 1:dim(D[[1]])[1]
        }
    }else if(length(subset) == 2){
        if(inherits(subset,c("character","POSIXlt","POSIXct","POSIXt"))){
            # Start and end of a period is given
            subset <- in_range(subset[1], D$t, subset[2])
        }
    }else{
        # Check if a non-meaningful subset is given
        if(inherits(subset,"character")){
            stop("subset cannot be a character, except if it is of length 2 and can be converted in a POSIX, e.g. subset=c('2020-01-01','2020-01-10'. ")
        }
    }
    # Take all horizons k?
    if(is.na(kseq[1])){
        val <- lapply(D[nms], function(X) {
            if (inherits(X,"data.frame")) {
                return(X[subset, , drop=FALSE]) # drop = FALSE needed in case data frame only has 1 column, otherwise this does not return a data frame
            } else {
                return(X[subset])
            }
        })
    }else{
        # Multiple horizons (hence length(kseq) > 1)
        # Take the specified horizons
        val <- lapply(D[nms], function(X) {
            if (inherits(X,"data.frame")) {
                # Check if holds forecasts by checking if any name is "kxx"
                if(length(grep("k[[:digit:]]+$", names(X))) > 0){
                    return(X[subset,pst("k",kseq), drop=FALSE])
                }else{
                    return(X[subset, , drop=FALSE])
                }
            } else {
                return(X[subset])
            }
        })
    }
    # Lag the forecasts k if specified
    if(lagforecasts){
        val <- lapply(val, function(X){
            if(inherits(X,"data.frame") & length(grep("k[[:digit:]]+$",names(X))) > 0) {
                return(lagdf.data.frame(X, lagseq="+k"))
            }else{
                return(X)
            }
        })
    }
    class(val) <- c("data.list","list")
    return(val)
}


#' Converts a data.list to a data.frame.
#'
#' The forecasts in the data.list will result in columns named \code{varname.kxx} in the data.frame.
#' 
#' @title Convert to data.frame
#' @param x The data.list to be converted.
#' @param row.names Not used.
#' @param optional Not used.
#' @param ... Not used.
#' @return A data.frame
#' @examples
#'
#' #' # Use the data.list with building heat load 
#' D <- Dbuilding
#' # Take a subset
#' D <- subset(D, 1:5, nms=c("t","Taobs","Ta","Iobs","I"), kseq=1:3)
#'
#' # Convert to a data.frame, note the names of the forecasts are appended .kxx (i.e. for Ta and I)
#' as.data.frame(D)
#'
#' @export
as.data.frame.data.list <- function(x, row.names=NULL, optional=FALSE, ...){
    # Then convert into a data.frame
    val <- do.call("cbind", x)
    if(inherits(val,"matrix")){
        val <- as.data.frame(val)
    }
    # Fix names of data.frames (i.e. forecasts, if their names are now "kxx", but should be X.kxx)
    i <- grep("^k[[:digit:]]+$", names(val))
    if(length(i) > 0){
        names(val)[i] <- pst(names(x)[i],".",names(val)[i])
    }
    return(val)
}


#' Generate a pairs plot for the vectors in the data.list.
#'
#' A very useful plot for checking what is in the forecasts, how they are synced and match the observations.
#' 
#' @title Generation of pairs plot for a data.list.
#' @param x The data.list from which to plot.
#' @param subset The subset to be included. Passed to \code{\link{subset.data.list}()}.
#' @param nms The names of the variables to be included. Passed to \code{\link{subset.data.list}()}.
#' @param kseq The horizons to be included. Passed to \code{\link{subset.data.list}()}.
#' @param lagforecasts Lag the forecasts such that they are synced with obervations. Passed to \code{\link{subset.data.list}()}.
#' @param pattern Regex pattern to select the included variables. Passed to \code{\link{subset.data.list}()}.
#' @param lower.panel Passed to \code{\link{pairs}()}.
#' @param panel Passed to \code{\link{pairs}()}.
#' @param pch Passed to \code{\link{pairs}()}.
#' @param cex Passed to \code{\link{pairs}()}.
#' @param ... Passed to \code{\link{pairs}()}.
#' @examples
#' # Take a subset for the example
#' D <- subset(Dbuilding, c("2010-12-15","2011-01-15"), pattern="^Ta|^I", kseq=1:3)
#' pairs(D)
#'
#' # If the forecasts and the observations are not aligned in time,
#' # which is easy to see by comparing to the previous plot.
#' pairs(D, lagforecasts=FALSE)
#' # Especially for the solar I syncronization is really important!
#' # Hence if the forecasts were not synced properly, then it can be detected using this type of plot.
#'
#' # Alternatively, lag when taking the subset
#' D <- subset(Dbuilding, c("2010-12-15","2011-01-15"), pattern="^Ta|^I", kseq=1:3, lagforecasts=TRUE)
#' pairs(D, lagforecasts=FALSE)
#'
#' @importFrom graphics panel.smooth pairs
#' @export
pairs.data.list <- function(x, subset = NA, nms = NA, kseq = NA, lagforecasts = TRUE, pattern = NA, lower.panel=NULL, panel=panel.smooth, pch=20, cex=0.7, ...){
    # First take the subset
    X <- as.data.frame(subset(x, subset = subset, nms = nms, kseq = kseq, lagforecasts = lagforecasts, pattern = pattern))
    #
    pairs(X, lower.panel=lower.panel, panel=panel, pch=pch, cex=cex, ...)
}


#' Summary including checks of the data.list for appropriate form. 
#'
#' Prints on table form the result of the checks.
#' 
#' @title Summary with checks of the data.list for appropriate form. 
#' @param object The object to be summarized and checked
#' @param printit A boolean deciding if check results tables are printed
#' @param stopit A boolean deciding if the function stop with an error if the check is not ok
#' @param nms A character vector. If given specifies the variables (vectors or matrices) in object to check
#' @param msgextra A character which is added in the printout of an (potential) error message
#' @param ... Not used
#' @return The tables generated.
#'
#' Checking the data.list for appropriate form:
#'
#' A check of the time vector t, which must have equidistant time points and no NAs.
#'
#' Then the results of checks of vectors (observations):
#' 
#'   - NAs: Proportion of NAs
#' 
#'   - length: Same length as the 't' vector?
#' 
#'   - class: The class of the vector
#' 
#' Then the results of checking data.frames and matrices (forecasts):
#' 
#'   - maxHorizonNAs: The proportion of NAs for the horizon (i.e. column) with the highest proportion of NAs
#' 
#'   - meanNAs: The proportion of NAs of the entire matrix
#' 
#'   - nrow: Same length as the 't' vector?
#' 
#'   - colnames: Columns must be names 'kx', where 'x' is the horizon (e.g. k12 is 12-step horizon)
#' 
#'   - sameclass: Error if not all columns are the same class
#' 
#'   - class: Prints the class of the columns if they are all the same
#' 
#' @examples
#' 
#' summary(Dbuilding)
#' 
#' # Some NAs in k1 forecast
#' D <- Dbuilding
#' D$Ta$k1[1:1500] <- NA
#' summary(D)
#'
#' # Vector with observations not same length as t throws error
#' D <- Dbuilding
#' D$heatload <- D$heatload[1:10]
#' try(summary(D))
#' 
#' # Forecasts wrong count
#' D <- Dbuilding
#' D$Ta <- D$Ta[1:10, ]
#' try(summary(D))
#' 
#' # Wrong column names
#' D <- Dbuilding
#' names(D$Ta)[4] <- "xk"
#' names(D$Ta)[2] <- "x2"
#' try(summary(D))
#' 
#' # No column names
#' D <- Dbuilding
#' names(D$Ta) <- NULL
#' try(summary(D))
#' 
#' # Don't stop or only print if stopped 
#' onlineforecast:::summary.data.list(D, stopit=FALSE)
#' try(onlineforecast:::summary.data.list(D, printit=FALSE))
#'
#' # Only check for specified variables
#' # (e.g. do like this in model functions to check only variables used in model)
#' onlineforecast:::summary.data.list(D, nms=c("heatload","I"))
#' 
#' @export
summary.data.list <- function(object, printit=TRUE, stopit=TRUE, nms=names(object), msgextra="", ...){
    D <- object

    # The final message
    msg <- NULL

    # Check the time vector
    if(!"t" %in% names(D)){ msg <- c(msg,"'t' is missing in the data.list: It must be a vector of equidistant time points (can be an integer, but preferably POSIXct class with tz 'GMT' or 'UTC'.)")}
    if(length(D$t) > 1){
        if(length(unique(diff(D$t))) != 1){ msg <- c(msg,"'t' is not equidistant or have NA values.") }
    }

    # Which elements are data.frame or matrix?
    isMatrix <- sapply(D, function(x){ inherits(x,c("matrix","data.frame")) })
    
    # Vectors check
    vecseq <- which(!isMatrix  &  names(isMatrix) != "t"  & names(isMatrix) %in% nms)
    Observations <- NA
    if(length(vecseq) > 0){
        vecchecks <- c("NAs","length","class")
        Observations <- data.frame(matrix("ok", nrow=length(vecseq), ncol=length(vecchecks), dimnames=list(pst("$",names(vecseq)),vecchecks)), stringsAsFactors=FALSE)
        #
        for(i in 1:length(vecseq)){
            #
            nm <- names(vecseq)[i]
            # NAs
            NAs <- round(max(sum(is.na(D[nm])) / length(D[nm])))
            Observations$NAs[i] <- pst(NAs,"%")
            # Check the length
            if(length(D[[nm]]) != length(D$t)){
                Observations$length[i] <- "ERROR"
                msg <- c(msg,pst(rownames(Observations)[i]," (length ",length(D[[nm]]),"), not same length as t (length ",length(D$t),")"))
            }
            # Its class
            Observations$class[i] <- class(D[[nm]])
        }
    }

    # Forecasts check
    dfseq <- which(isMatrix  &  names(isMatrix) %in% nms)
    Forecasts <- NA
    if(length(dfseq) > 0){
        dfchecks <- c("maxHorizonNAs","NAs","nrow","colnames","sameclass","class")
        Forecasts <- data.frame(matrix("ok", nrow=length(dfseq), ncol=length(dfchecks), dimnames=list(pst("$",names(dfseq)),dfchecks)), stringsAsFactors=FALSE)
        #
        for(i in 1:length(dfseq)){
            #
            nm <- names(dfseq)[i]
            colnms <- nams(D[[nm]])
            if(is.null(colnms)){
                msg <- c(msg, pst("'",nm,"' has no column names! Columns in forecast matrices must be named 'kx', where x is the horizon (e.g. 'k12' is the column with the 12 step forecast)"))
                Forecasts[i, ] <- rep(NA,ncol(Forecasts))
            }else{
                # max NAs
                tmp <- round(max(sapply(colnms, function(colnm){ 100*sum(is.na(D[[nm]][ ,colnm])) / nrow(D[[nm]]) })))
                Forecasts$maxHorizonNAs[i] <- pst(tmp,"%")
                # Mean NAs
                tmp <- round(mean(sapply(colnms, function(colnm){ 100*sum(is.na(D[[nm]][ ,colnm])) / nrow(D[[nm]]) })))
                Forecasts$NAs[i] <- pst(tmp,"%")
                # Check the number of rows
                if(nrow(D[[nm]]) != length(D$t)){
                    Forecasts$nrow[i] <- "ERROR"
                    msg <- c(msg, pst(nm," has ",nrow(D[[nm]])," rows, must be equal to length of t (n=",length(D$t),")"))
                }
                # Check the colnames, are they unique and all k+integer?
                tmp <- unique(grep("k[[:digit:]]+$",colnms,value=TRUE))
                if(!length(tmp) == length(colnms)){
                    Forecasts$colnames[i] <- "ERROR"
                    msg <- c(msg, pst(nm," has columns named: '",pst(colnms[!(colnms %in% tmp)],collapse="','"),"'. Columns in forecast matrices must be named 'kx', where x is the horizon (e.g. 'k12' is the column with the 12 step forecast)"))
                }
                if(!length(unique(sapply(colnms, function(colnm){ class(D[[nm]][ ,colnm]) }))) == 1){
                    Forecasts$sameclass[i] <- "ERROR"
                    msg <- c(msg, pst(nm," doesn't have same class for all columns"))
                }else{
                    Forecasts$class[i] <- class(D[[nm]][ ,1])
                }
            }
        }
    }

    # Print the results
    if(printit){
        cat("\nLength of time vector 't': ",length(D$t),"\n\n", sep="")
        if(length(vecseq) > 0){
        #    cat("\n- Observation vectors:\n")
            print(Observations)
        }
        if(length(dfseq) > 0){
            #   cat("\n- Forecast data.frames or matrices:\n")
            cat("\n")
            print(Forecasts)
        }
    }

    # Error message to print?
    if(length(msg) > 0){
        cat("\n")
        msg <- c(msg,"\nSee '?summary.data.list' for more information")
        # Stop or just print
        if(stopit){
            stop(pst(msg,collapse="\n"))
        }else{
            cat("ERRORS: \n",pst(msg,collapse="\n"),"\n")
        }
    }

    # Return
    invisible(list(Observations=Observations, Forecasts=Forecasts))
}


#' Compare two data.lists
#'
#' Returns TRUE if the two data.lists are fully identical, so all data, order of variables etc. must be fully identical
#' 
#' @title Determine if two data.lists are identical
#'
#' @param x first data.list  
#' @param y second data.list
#' @return logical
#'
#' @examples
#'
#' Dbuilding == Dbuilding
#'
#' D <- Dbuilding
#' D$Ta$k2[1] <- NA
#' Dbuilding == D
#'
#' D <- Dbuilding
#' names(D)[5] <- "I"
#' names(D)[6] <- "Ta"
#' Dbuilding == D
#' 
#' 
#' @export

"==.data.list" <- function(x, y) {
    if(length(x) != length(y)){
        return(FALSE)
    }
    if(any(names(x) != names(y))){
        return(FALSE)
    }
    # Check each variable
    tmp <- lapply(1:length(x), function(i){
        xi <- x[[i]]
        yi <- y[[i]]
        if(length(class(xi)) != length(class(yi))){
            return(FALSE)
        }
        if(any(class(xi) != class(yi))){
            return(FALSE)
        }
        if(is.null(dim(xi))){
            # It's a vector
            if(length(xi) != length(yi)){
                return(FALSE)
            }
        }else{
            # It's a data.frame or matrix
            if(any(dim(xi) != dim(yi))){
                return(FALSE)
            }
        }
        # Check the NA values are the same
        if(any(is.na(xi) != is.na(yi))){
            return(FALSE)
        }
        # Check the values
        all(xi == yi, na.rm=TRUE)
    })
    if(any(!unlist(tmp))){
        return(FALSE)
    }
    # All checks passed
    return(TRUE)
}