# Infix "not in" operator: the element-wise logical negation of `%in%`.
`%!in%` <- function(...) {
  !(`%in%`(...))
}

#' check_file_is_valid_peaks
#'
#' Check to make sure the input data looks like the expected PEAKS file.
#'
#' @param peaks_data protein-peptides-lfq.csv file generated by PEAKS read
#' into R.
#' @return Called for its side effect: stops with an informative message if
#' any expected column is missing or if any value in the PTM column is not
#' NA. Otherwise returns `NULL` invisibly.
#' @keywords internal
check_file_is_valid_peaks <- function(peaks_data) {
  # These are the names of the columns expected to be in every PEAKS file
  expected_names <- c(
    "Protein Group", "Protein ID", "Protein Accession", "Peptide",
    "Used", "Candidate", "Quality", "Significance", "Avg. ppm", "Avg. Area",
    "Sample Profile (Ratio)", "Group Profile (Ratio)", "Max Ratio", "#Vector",
    "Start", "End", "PTM"
  )
  # Even a single missing column means the file is not in the expected shape.
  missing_names <- setdiff(expected_names, names(peaks_data))
  if (length(missing_names) > 0) {
    stop(
      "This doesn't look like the expected PEAKS file. The following ",
      "columns are missing: ", paste(missing_names, collapse = ", ")
    )
  }
  # PEAKS files with PTMs are not supported: any non-NA PTM entry is rejected.
  if (any(!is.na(peaks_data$PTM))) {
    stop(
      "There are PTMS in your file, redo PEAKS search without ",
      "PTMs. PTM analysis is not currently supported"
    )
  }
  invisible(NULL)
}

#' check_file_is_valid_fragpipe
#'
#' Check to make sure the input data looks like the expected FragPipe file.
#'
#' @param fragpipe_data combined_peptide.tsv file generated by FragPipe read
#' into R.
#'
#' @return Called for its side effect: stops with an informative message if
#' any expected column is missing. Otherwise returns `NULL` invisibly.
#' @keywords internal
check_file_is_valid_fragpipe <- function(fragpipe_data) {
  # These are the names of the columns expected to be in every fragpipe file
  expected_names <- c(
    "Peptide Sequence", "Prev AA", "Next AA", "Start", "End", "Peptide Length",
    "Charges", "Protein", "Protein ID", "Entry Name", "Gene",
    "Protein Description", "Mapped Genes", "Mapped Proteins"
  )
  # Even a single missing column means the file is not in the expected shape.
  missing_names <- setdiff(expected_names, names(fragpipe_data))
  if (length(missing_names) > 0) {
    stop(
      "This doesn't look like the expected fragpipe file. The following ",
      "columns are missing: ", paste(missing_names, collapse = ", ")
    )
  }
  invisible(NULL)
}

#' check_file_is_valid_pd
#'
#' Check to make sure the input data looks like the expected ProteomeDiscoverer
#' file.
#'
#' @param pd_data PeptideGroups.txt file generated by ProteomeDiscoverer and
#' read into R.
#'
#' @return Called for its side effect: stops with an informative message if
#' any expected column is missing. Otherwise returns `NULL` invisibly.
#' @keywords internal
check_file_is_valid_pd <- function(pd_data) {
  # These are the names of the columns expected to be in every
  # ProteomeDiscoverer file
  expected_names <- c(
    "Peptide Groups Peptide Group ID", "Checked", "Confidence",
    "Annotated Sequence", "Modifications", "Qvality PEP",
    "Qvality q-value", "# Protein Groups", "# Proteins",
    "# PSMs", "Master Protein Accessions",
    "Positions in Master Proteins",
    "Modifications in Master Proteins",
    "# Missed Cleavages"
  )
  # Even a single missing column means the file is not in the expected shape.
  missing_names <- setdiff(expected_names, names(pd_data))
  if (length(missing_names) > 0) {
    stop(
      "This doesn't look like the expected ProteomeDiscoverer file. The ",
      "following columns are missing: ", paste(missing_names, collapse = ", ")
    )
  }
  invisible(NULL)
}

#' cterm_cleavage
#'
#' Locate the C-terminal cleavage of a peptide relative to the library
#' sequence it was derived from, and report the residues flanking it.
#'
#' @param peptide_sequence the peptide sequence represented in single letter
#' code. "_" denotes cleavage site. A C-terminal cleavage is recognised when
#' the last "_" is the second-to-last character.
#' @param library_match_sequence the sequence the peptide matches to with the
#' proteomics search software used. Note, this may not be the true sequence of
#' the peptide depending on how the library was constructed. For example, in
#' the standard MSP-MS 228 member library, methionine has been replaced with
#' norleucine (n). This was done because norleucine looks like methionine to a
#' protease, but it cannot be oxidized. Norleucine's (n) mass is the same as
#' leucine (L), so it is recognized by the proteomics software as L.
#' @param library_real_sequence the sequence the peptide truly is. In the
#' standard MSP-MS 228 member library, some of the amino acids recognized as
#' leucine (L) are truly norleucine (n).
#' @param n_residues the number of residues to the left and right of
#' the cleavage event to return.
#'
#' @return a one-row tibble with columns `peptide` (the input sequence),
#' `cterm` (the `n_residues` residues on each side of the cleavage, read from
#' the real library sequence padded with "X") and `cterm_cleavage_pos` (the
#' cleavage position within the unpadded match sequence). `cterm` and
#' `cterm_cleavage_pos` are NA when no C-terminal cleavage is present.
#' @keywords internal
cterm_cleavage <- function(peptide_sequence,
                           library_match_sequence,
                           library_real_sequence,
                           n_residues = 4) {
  underscore_hits <- gregexpr("_", peptide_sequence)[[1]]
  last_underscore <- underscore_hits[length(underscore_hits)]
  seq_length <- nchar(peptide_sequence)
  # A C-terminal cleavage means the final "_" sits one character before the
  # end of the peptide string.
  has_cterm_cut <- grepl("_", peptide_sequence) &&
    last_underscore == seq_length - 1
  if (!has_cterm_cut) {
    return(tibble::tibble(
      peptide = peptide_sequence,
      cterm = NA,
      cterm_cleavage_pos = NA
    ))
  }
  # Everything left of the final "_", with any other "_" removed.
  left_of_cut <- gsub("_", "", substr(peptide_sequence, 1, seq_length - 2))
  left_length <- nchar(left_of_cut)
  # Pad both library sequences with n_residues "X" on each side so the
  # flanking window can always be extracted.
  padding <- strrep("X", n_residues)
  padded_match <- paste0(padding, library_match_sequence, padding,
    collapse = ""
  )
  padded_real <- paste0(padding, library_real_sequence, padding,
    collapse = ""
  )
  # Position (in the padded match sequence) of the last residue before the
  # cleavage.
  cut_in_padded <- regexpr(left_of_cut, padded_match)[[1]][1] +
    left_length - 1
  residues_before <- substr(
    padded_real,
    start = cut_in_padded - n_residues + 1,
    stop = cut_in_padded
  )
  residues_after <- substr(
    padded_real,
    start = cut_in_padded + 1,
    stop = cut_in_padded + n_residues
  )
  # Same position, but expressed in the coordinates of the unpadded sequence.
  cut_in_library <- regexpr(left_of_cut, library_match_sequence)[[1]][1] +
    left_length - 1
  tibble::tibble(
    peptide = peptide_sequence,
    cterm = paste0(residues_before, residues_after),
    cterm_cleavage_pos = cut_in_library
  )
}

#' nterm_cleavage
#'
#' Locate the N-terminal cleavage of a peptide relative to the library
#' sequence it was derived from, and report the residues flanking it.
#'
#' @param peptide_sequence the peptide sequence represented in single letter
#' code. "_" denotes cleavage site. An N-terminal cleavage is recognised when
#' the first "_" is the second character.
#' @param library_match_sequence the sequence the peptide matches to with the
#' proteomics search software used. Note, this may not be the true sequence of
#' the peptide depending on how the library was constructed. For example, in
#' the standard MSP-MS 228 member library, methionine has been replaced with
#' norleucine (n). This was done because norleucine looks like methionine to a
#' protease, but it cannot be oxidized. Norleucine's (n) mass is the same as
#' leucine (L), so it is recognized by the proteomics software as L.
#' @param library_real_sequence the sequence the peptide truly is. In the
#' standard MSP-MS 228 member library, some of the amino acids recognized as
#' leucine (L) are truly norleucine (n).
#' @param n_residues the number of residues to the left and right of
#' the cleavage event to return.
#'
#' @return a one-row tibble with columns `peptide` (the input sequence),
#' `nterm` (the `n_residues` residues on each side of the cleavage, read from
#' the real library sequence padded with "X") and `nterm_cleavage_pos` (the
#' cleavage position within the unpadded match sequence). `nterm` and
#' `nterm_cleavage_pos` are NA when no N-terminal cleavage is present.
#' @keywords internal
nterm_cleavage <- function(peptide_sequence,
                           library_match_sequence,
                           library_real_sequence,
                           n_residues = 4) {
  # An N-terminal cleavage means the first "_" is the second character.
  has_nterm_cut <- grepl("_", peptide_sequence) &&
    regexpr("_", peptide_sequence)[[1]][1] == 2
  if (!has_nterm_cut) {
    return(tibble::tibble(
      peptide = peptide_sequence,
      nterm = NA,
      nterm_cleavage_pos = NA
    ))
  }
  # Everything right of the leading "_" (from the third character on), with
  # any other "_" removed.
  right_of_cut <- gsub(
    "_", "",
    substr(peptide_sequence, 3, nchar(peptide_sequence))
  )
  # Pad both library sequences with n_residues "X" on each side so the
  # flanking window can always be extracted.
  padding <- strrep("X", n_residues)
  padded_match <- paste0(padding, library_match_sequence, padding,
    collapse = ""
  )
  padded_real <- paste0(padding, library_real_sequence, padding,
    collapse = ""
  )
  # Position (in the padded match sequence) of the last residue before the
  # cleavage, i.e. one before where the right-hand fragment starts.
  cut_in_padded <- regexpr(right_of_cut, padded_match)[[1]][1] - 1
  residues_before <- substr(
    padded_real,
    start = cut_in_padded - n_residues + 1,
    stop = cut_in_padded
  )
  residues_after <- substr(
    padded_real,
    start = cut_in_padded + 1,
    stop = cut_in_padded + n_residues
  )
  # Same position, but expressed in the coordinates of the unpadded sequence.
  cut_in_library <- regexpr(right_of_cut, library_match_sequence)[[1]][1] - 1
  tibble::tibble(
    peptide = peptide_sequence,
    nterm = paste0(residues_before, residues_after),
    nterm_cleavage_pos = cut_in_library
  )
}

#' add_cleavages
#'
#' Adds cleavage information to a tibble by applying nterm_cleavage() and
#' cterm_cleavage() to every row and binding the results to the input.
#'
#' @param joined_with_library a tibble containing columns named "peptide",
#' "library_match_sequence", and "library_real_sequence".
#' @param n_residues the number of residues to the left and right of the
#' cleavage site to include in the output.
#' @return a tibble whose first columns are peptide, nterm,
#' nterm_cleavage_pos, cterm and cterm_cleavage_pos, followed by the columns
#' of `joined_with_library` other than "peptide".
#' @keywords internal
add_cleavages <- function(joined_with_library, n_residues = 4) {
  # Both cleavage helpers take the same positional arguments, so build the
  # argument list once and reuse it.
  cleavage_args <- list(
    joined_with_library$peptide,
    joined_with_library$library_match_sequence,
    joined_with_library$library_real_sequence,
    n_residues
  )
  nterm_info <- purrr::pmap_df(cleavage_args, nterm_cleavage)
  cterm_info <- purrr::pmap_df(cleavage_args, cterm_cleavage)
  # Columns 2:3 of the cterm result are cterm and cterm_cleavage_pos; the
  # peptide column is already supplied by the nterm result.
  cleavage_info <- dplyr::bind_cols(nterm_info, cterm_info[, 2:3])
  remaining_cols <- dplyr::select(joined_with_library, -"peptide")
  dplyr::bind_cols(cleavage_info, remaining_cols)
}

#' consolidate_cleavages
#'
#' Consolidate the n term and c term cleavage data. The nterm and cterm
#' cleavage information are consolidated into a single column and rows
#' that have both nterm and cterm cleavage information are removed.
#'
#' @param cleavage_added_data a tibble where cleavage information has
#' been added by add_cleavages()
#'
#' @return a tibble in which nterm, cterm, nterm_cleavage_pos and
#' cterm_cleavage_pos are replaced by cleavage_seq, cleavage_pos and
#' peptide_type ("full_length" when cleavage_pos is NA, otherwise
#' "cleavage_product"), with rows carrying both an nterm and a cterm
#' cleavage removed.
#' @keywords internal
consolidate_cleavages <- function(cleavage_added_data) {
  # Take the sequence from whichever terminus is the only one with a cleavage.
  with_seq <- dplyr::mutate(
    cleavage_added_data,
    cleavage_seq = dplyr::case_when(
      !is.na(.data$nterm) & is.na(.data$cterm) ~ .data$nterm,
      !is.na(.data$cterm) & is.na(.data$nterm) ~ .data$cterm,
      TRUE ~ NA
    ),
    .after = "cterm_cleavage_pos"
  )
  # Keep rows where at least one terminus has no cleavage (drops doubles).
  single_cut <- dplyr::filter(
    with_seq,
    is.na(.data$cterm) | is.na(.data$nterm)
  )
  # Prefer the cterm position; fall back to the nterm position when it is NA.
  with_pos <- dplyr::mutate(
    single_cut,
    cleavage_pos = dplyr::case_when(
      is.na(.data$cterm_cleavage_pos) ~ .data$nterm_cleavage_pos,
      TRUE ~ .data$cterm_cleavage_pos
    ),
    .after = "cleavage_seq"
  )
  # Label each peptide as cleaved or not.
  with_type <- dplyr::mutate(
    with_pos,
    peptide_type = dplyr::case_when(
      is.na(.data$cleavage_pos) ~ "full_length",
      TRUE ~ "cleavage_product"
    ),
    .after = "cleavage_pos"
  )
  dplyr::select(
    with_type,
    -"nterm", -"cterm", -"nterm_cleavage_pos",
    -"cterm_cleavage_pos"
  )
}
#' convert prepared data to a QFeatures object
#'
#' Adds peptide length and cleavage information to the prepared data,
#' validates it against the peptide library and the sample metadata, and
#' reads the result into a QFeatures object.
#'
#' @param prepared_data data prepared within one of the prepare functions
#' @param colData  sample metadata
#' @param peptide_library the peptide library used.
#' @param n_residues the number of residues reported in the cleavage site
#'
#' @return a QFeatures object. Stops with an informative message if the data
#' contain library ids absent from the peptide library, or if colData lists
#' quantCols that are not columns of the prepared data.
#' @keywords internal
prepared_to_qf <- function(prepared_data,
                           colData,
                           peptide_library = mspms::peptide_library,
                           n_residues = 4) {
  # Peptide length is counted after stripping a leading "<char>_" and a
  # trailing "_<char>" from the peptide string.
  prepared_data <- prepared_data %>%
    dplyr::mutate(
      peptide_length = gsub("^._", "", .data$peptide),
      peptide_length = nchar(gsub("_.$", "", .data$peptide_length)),
      .after = "peptide"
    )
  # check peptide library for correct names.
  check_peptide_library(peptide_library)
  # Making sure that prepared_data and peptide library are consistent
  ids_not_in_library <- setdiff(
    unique(prepared_data$library_id),
    peptide_library$library_id
  )
  if (length(ids_not_in_library) > 0) {
    stop(
      "There are peptide library ids in your data that are not in your ",
      "peptide library. Specifically ",
      paste0(ids_not_in_library, collapse = ", "),
      " are missing from your peptide library."
    )
  }
  # combining peptide sequences
  combined <- dplyr::inner_join(peptide_library, prepared_data,
    by = "library_id"
  ) %>%
    add_cleavages(n_residues = n_residues) %>%
    consolidate_cleavages()
  # Every quantCol named in colData must exist in the prepared data (all
  # columns except the first two are considered).
  names_in_prepared_data <- names(prepared_data)[-c(1, 2)]
  quantcols_not_in_data <- setdiff(colData$quantCols, names_in_prepared_data)
  if (length(quantcols_not_in_data) > 0) {
    stop(
      "the quantCol names in your colData do not match those in your",
      " proteomics data. Specifically the column(s) ",
      paste0(quantcols_not_in_data, collapse = " "),
      " are present in your colData but not in your",
      " proteomics data"
    )
  }
  # Columns 9 onward of the combined table are passed as quantitative
  # columns. NOTE(review): this presumes exactly eight annotation columns
  # precede the sample columns - confirm against the prepare functions.
  QF <- QFeatures::readQFeatures(combined,
    quantCols = 9:length(combined),
    fnames = "peptide",
    colData = colData,
    name = "peptides"
  )
  return(QF)
}

#' load_colData
#'
#' load a .csv file containing sample colData. Check for errors
#'
#' @param colData_filepath filepath to .csv file containing colData.
#'
#' @return a tibble as read by readr::read_csv(). Stops with an informative
#' message if a required column is missing, if a required column has the
#' wrong type, or if any value contains a space.
#' @keywords internal
load_colData <- function(colData_filepath) {
  colData <- readr::read_csv(colData_filepath)
  # Required columns, each paired with the predicate its values must satisfy.
  type_checks <- list(
    quantCols = is.character,
    group = is.character,
    condition = is.character,
    time = is.numeric
  )
  expected_names <- names(type_checks)
  missing_names <- setdiff(expected_names, names(colData))
  if (length(missing_names) > 0) {
    stop(
      "colData must have columns named \"quantCols\", \"group\", ",
      "\"condition\", and \"time\". You are missing: ",
      paste(missing_names, collapse = ", ")
    )
  }
  # Check types by column name so that column order and any additional
  # columns do not affect the result.
  has_wrong_type <- !vapply(
    expected_names,
    function(nm) type_checks[[nm]](colData[[nm]]),
    logical(1)
  )
  if (any(has_wrong_type)) {
    stop(
      "colData columns must be as follows: quantCols = character, ",
      "group = character, condition = character, time = numeric. ",
      "You have the wrong type for the following column(s): ",
      paste(expected_names[has_wrong_type], collapse = ", ")
    )
  }
  # Check to make sure none of the columns have spaces in them
  has_space <- vapply(
    colData,
    function(x) any(grepl(" ", x, fixed = TRUE)),
    logical(1)
  )
  if (any(has_space)) {
    stop(
      "Your colData has spaces in it (ie condition = Cathepsin A). Please ",
      "change it to not contain any spaces to be compatible with limma (ie ",
      "condition = cathepsin_A)"
    )
  }
  return(colData)
}
  
#' check_peptide_library
#'
#' Validate the column names of a peptide library.
#'
#' @param peptide_library the peptide library to validate. Its names must be
#' exactly library_id, library_match_sequence and library_real_sequence, in
#' that order.
#'
#' @return an informative error if the column names of the peptide library are
#' unexpected. Otherwise nothing.
#' @keywords internal
check_peptide_library <- function(peptide_library) {
  required_names <- c(
    "library_id",
    "library_match_sequence",
    "library_real_sequence"
  )
  # Nothing to report when the names match exactly.
  if (identical(names(peptide_library), required_names)) {
    return(invisible(NULL))
  }
  stop("the first three columns of the peptide library .csv are not as
         expected. They must be library_id, library_match_sequence, and
         library_real_sequence")
}
