# NM2_MRS_import_v2.r

# V2 uses 'metabolite' rather than 'analyte'

# Excel data set uses empty cells for implied content: 
# The following utility fills in spaces sequentially using
# a 'previous value if missing' rule.

# Forward-fill missing values in the non-numeric columns of a data frame.
#
# Args:
#   x: a data frame (any list of equal-length columns is accepted).
# Returns:
#   A data.frame in which each NA in a non-numeric column is replaced by the
#   nearest preceding value of that column. Numeric columns are returned
#   unchanged, and a leading NA (which has no predecessor) stays NA.
fill.empty <- function(x) {
  fill1 <- function(x1) {
    if (!is.numeric(x1)) {
      # Start at element 2. seq_along(x1)[-1] is empty for columns of length
      # 0 or 1, whereas 2:length(x1) would count down (2, 1 or 2, 1, 0) and
      # either extend the column or fail on a zero-length replacement.
      # The loop is deliberately sequential so that runs of consecutive NAs
      # all inherit the same preceding value.
      for (i in seq_along(x1)[-1]) {
        if (is.na(x1[i])) x1[i] <- x1[i - 1]
      }
    }
    x1
  }

  as.data.frame(lapply(x, fill1))
}

# Data sets are complete with headers and no surplus information,
# so defaults are sufficient for raw data input
# Subject field should be a factor, but automatic import reads it as
# numeric. It is converted explicitly after reading because its column
# position varies between data sets, which makes specifying column types
# at import impractical.

# Pulse "Data" column is spectral intensity (see 2020-10-16_LR-SLRE.pdf)

# Each workbook is read, Subject is converted to a factor, and the blank
# (implied) cells are then forward-filled, all in one expression per data set.
# For the two pulse data sets the last column is renamed to "Intensity".

concentrations.pp <- fill.empty(
  within(read_xls(params$concentrations.pp), Subject <- factor(Subject))
)

pulse.pp <- fill.empty(
  within(read_xls(params$pulse.pp), Subject <- factor(Subject))
)
colnames(pulse.pp)[length(pulse.pp)] <- "Intensity"


concentrations.nm <- fill.empty(
  within(read_xls(params$concentrations.nm), Subject <- factor(Subject))
)

pulse.nm <- fill.empty(
  within(read_xls(params$pulse.nm), Subject <- factor(Subject))
)
colnames(pulse.nm)[length(pulse.nm)] <- "Intensity"

# Session field includes session and replicate number. In addition,
# the subjects were repositioned between replicates in session 1 to
# allow estimations of between-location variance.
# The following code recodes Session to separate Session ID, Position (A, B) 
# and (although unnecessary for stats) Replicate within Session

# Split the combined Session code into separate Session, Replicate and
# Position factors.
#
# Args:
#   x: a data frame with a Session column coded as "<session>_<replicate>"
#      (for example "1_2").
# Returns:
#   A data frame in which Session holds only the session id, Replicate holds
#   the replicate id, and Position is "B" for session "1" replicate "2" and
#   "A" otherwise. Non-numeric columns are placed before numeric columns.
decode.session <- function(x) {
  runID <- as.character(x$Session)
  # Split on the underscore with anchored patterns so that ids longer than
  # one character are handled ("10_2" -> session "10", replicate "2").
  # Codes without an underscore are left unchanged in both fields.
  x$Session <- factor(sub("_.*$", "", runID))
  x$Replicate <- factor(sub("^.*_", "", runID))
  x$Position <- factor(
    ifelse(x$Session == "1" & x$Replicate == "2", "B", "A")
  )
  # Arrange for 'tidy' column ordering (categorical first)
  which.num <- vapply(x, is.numeric, logical(1))
  cbind(x[!which.num], x[which.num])
}

# Recode Session in each of the four data sets (the calls are independent)
pulse.pp <- decode.session(pulse.pp)
concentrations.pp <- decode.session(concentrations.pp)
pulse.nm <- decode.session(pulse.nm)
concentrations.nm <- decode.session(concentrations.nm)

# Finally, zero concentrations should be treated as missing values - 
# mark as NA
# Mark zero values in the numeric columns of a data frame as missing.
#
# Args:
#   x: a data frame (any list of equal-length columns is accepted).
# Returns:
#   A data.frame in which every numeric value below the tolerance (1e-6 by
#   default, so negative values are included) is replaced by NA. Non-numeric
#   columns are returned unchanged.
drop.zero <- function(x) {
  dz1 <- function(x1, tol = 1e-6) {
    if (is.numeric(x1)) {
      # Subassignment keeps the column's numeric type. ifelse() would return
      # a logical vector when every element is below tol or missing, and the
      # column would then no longer be recognised as numeric.
      # is.na() is included so the index never contains NA and NaN becomes NA.
      x1[is.na(x1) | x1 < tol] <- NA
    }
    x1
  }

  as.data.frame(lapply(x, dz1))
}

# Replace zero values by NA in each of the four data sets (the calls are
# independent)
pulse.pp <- drop.zero(pulse.pp)
concentrations.pp <- drop.zero(concentrations.pp)
pulse.nm <- drop.zero(pulse.nm)
concentrations.nm <- drop.zero(concentrations.nm)

# Retain a list of metabolite column names (common to both concentration data sets)

# Positions of the numeric columns of concentrations.pp, then their names
metabolite.cols <- which(unlist(Map(is.numeric, concentrations.pp)))
metabolites <- colnames(concentrations.pp)[metabolite.cols]