#' Read Kraken-style and MetaPhlAn reports
#'
#' @param myfile Kraken-style or MetaPhlAn report file.
#' @param has_header If the kraken report has a header or not.
#' @param check_file If TRUE, only the first 5 lines of the file are loaded.
#'
#' @return report data.frame
#' @export
#'
read_report <- function(myfile, has_header = NULL, check_file = FALSE) {
# TODO: Support for gzipped files .. myfile <- file(myfile) file_class <-
# summary(myfile)$class if (file_class == 'gzfile') myfile <- gzcon(myfile)
first.line <- tryCatch(readLines(myfile, n = 1, warn = FALSE), error = function(e) {
warning("Error reading ", myfile)
return()
})
isASCII <- function(txt) {
if (length(txt) == 0)
return(FALSE)
raw <- charToRaw(txt)
all(raw <= as.raw(127) && (raw >= as.raw(32) | raw == as.raw(9)))
}
if (length(first.line) == 0) {
message("Could not read ", myfile, ".")
return(NULL)
}
tryCatch({
if (nchar(first.line) == 0) {
message("First line of ", myfile, " is empty")
return(NULL)
}
}, error = function(e) {
message(e)
return(NULL)
})
if (!isTRUE(isASCII(first.line))) {
message(myfile, " is not a ASCII file")
return(NULL)
}
if (is.null(has_header)) {
has_header <- grepl("^[a-zA-Z#%\"]", first.line)
}
is_krakenu_fmt <- grepl("^.?%\treads\ttaxReads\tkmers", first.line)
is_kaiju_fmt <- grepl("^ *%\t *reads", first.line)
nrows <- ifelse(isTRUE(check_file), 5, -1)
if (!is_krakenu_fmt && is_kaiju_fmt) {
cont <- readLines(myfile)
cont <- cont[!grepl("^-", cont)]
cont <- sub(".*\t *", "", cont)
cont <- sub("; ?$", "", cont)
report <- utils::read.delim(textConnection(cont), stringsAsFactors = FALSE)
colnames(report) <- c("taxonReads", "taxLineage")
report$cladeReads <- report$taxonReads
report$taxLineage <- gsub("^", "-_", report$taxLineage)
report$taxLineage <- gsub("; ", "|-_", report$taxLineage)
report$taxLineage <- gsub("-_Viruses", "d_Viruses", report$taxLineage, fixed = T)
report$taxLineage <- gsub("-_cellular organisms|-_Bacteria", "-_cellular organisms|d_Bacteria",
report$taxLineage, fixed = T)
report$taxLineage <- gsub("-_cellular organisms|-_Eukaryota", "-_cellular organisms|d_Eukaryota",
report$taxLineage, fixed = T)
report$taxLineage <- gsub("-_cellular organisms|-_Archaea", "-_cellular organisms|d_Archaea",
report$taxLineage, fixed = T)
report$taxLineage[1:(length(report$taxLineage) - 1)] <- paste0("-_root|", report$taxLineage[1:(length(report$taxLineage) -
1)])
report$taxLineage[report$taxLineage == "-_unclassified"] <- "u_unclassified"
new_counts <- integer(length = 0)
for (j in seq_len(nrow(report))) {
count <- report$cladeReads[j]
tl <- report$taxLineage[j]
tl2 <- sub("\\|[^|]*$", "", tl)
while (tl2 != tl) {
if (tl2 %in% names(new_counts)) {
new_counts[tl2] <- new_counts[tl2] + count
} else {
new_counts[tl2] <- count
}
tl <- tl2
tl2 <- sub("\\|[^|]*$", "", tl)
}
}
report <- rbind(report, data.frame(taxonReads = 0, taxLineage = names(new_counts), cladeReads = as.integer(new_counts)))
tl_order <- order(report$taxLineage)
tl_order <- c(tl_order[length(tl_order)], tl_order[-length(tl_order)])
report <- report[tl_order, c("taxLineage", "taxonReads", "cladeReads")]
} else if (has_header) {
report <- tryCatch({
utils::read.table(myfile, sep = "\t", header = T, quote = "", stringsAsFactors = FALSE,
comment.char = "#", nrows = nrows, check.names = FALSE)
}, error = function(x) NULL, warning = function(x) NULL)
if (is.null(report)) {
return(NULL)
}
# colnames(report) <-
# c('percentage','cladeReads','taxonReads','taxRank','taxID','n_unique_kmers','n_kmers','perc_uniq_kmers','name')
## harmonize column names. TODO: Harmonize them in the scripts!
colnames(report)[colnames(report) %in% c("#%", "%", "clade_perc", "perc", "percReadsClade")] <- "percentage"
colnames(report)[colnames(report) %in% c("reads", "numReadsClade", "n_reads_clade",
"n.clade", "n-clade")] <- "cladeReads"
colnames(report)[colnames(report) %in% c("taxReads", "numReadsTaxon", "n_reads_taxo",
"n.stay", "n-stay")] <- "taxonReads"
colnames(report)[colnames(report) %in% c("rank", "tax_taxRank", "level")] <- "taxRank"
colnames(report)[colnames(report) %in% c("tax", "taxonid")] <- "taxID"
colnames(report)[colnames(report) %in% c("indentedName", "taxName")] <- "name"
colnames(report)[colnames(report) %in% c("dup")] <- "kmerDuplicity"
colnames(report)[colnames(report) %in% c("cov")] <- "kmerCoverage"
} else {
report <- tryCatch({
utils::read.table(myfile, sep = "\t", header = F, col.names = c("percentage", "cladeReads",
"taxonReads", "taxRank", "taxID", "name"), quote = "", stringsAsFactors = FALSE,
nrows = nrows)
}, error = function(x) NULL, warning = function(x) NULL)
if (is.null(report)) {
return(NULL)
}
}
if (ncol(report) < 2) {
return(NULL)
}
if (colnames(report)[2] == "Metaphlan2_Analysis") {
## Metaphlan report
colnames(report) <- c("taxLineage", "cladeReads")
report <- report[order(report$taxLineage), ]
report$taxLineage <- gsub("_", " ", report$taxLineage)
report$taxLineage <- gsub(" ", "_", report$taxLineage)
report$taxLineage <- paste0("-_root|", report$taxLineage)
report <- rbind(data.frame(taxLineage = c("u_unclassified", "-_root"), cladeReads = c(0,
100), stringsAsFactors = F), report)
}
if (all(c("name", "taxRank") %in% colnames(report)) && !"taxLineage" %in% colnames(report)) {
## Kraken report
report$depth <- nchar(gsub("\\S.*", "", report$name))/2
if (!all(report$depth == floor(report$depth))) {
warning("Depth doesn't work out!")
return(NULL)
}
report$name <- gsub("^ *", "", report$name)
## 'fix' taxRank
table(report$taxRank)
allowed_taxRanks <- c("U", "S", "G", "F", "C", "D", "O", "K", "P")
report$taxRank[report$taxRank == "class"] <- "C"
report$taxRank[report$taxRank == "family"] <- "F"
report$taxRank[report$taxRank == "genus"] <- "G"
report$taxRank[report$taxRank == "superkingdom"] <- "D"
report$taxRank[report$taxRank == "kingdom"] <- "K"
report$taxRank[report$taxRank == "order"] <- "O"
report$taxRank[report$taxRank == "phylum"] <- "P"
report$taxRank[report$taxRank == "species"] <- "S"
report$taxRank[report$name == "unclassified"] <- "U"
report$taxRank[!report$taxRank %in% allowed_taxRanks] <- "-"
report$name <- paste(tolower(report$taxRank), report$name, sep = "_")
rownames(report) <- NULL
## make taxLineage path
report$taxLineage <- report$name
n <- nrow(report)
depths <- report$depth
taxLineages <- report$name
taxLineages_p <- as.list(seq_along(report$name))
depth_row_tmp <- c(1:25)
for (current_row in seq(from = 1, to = nrow(report))) {
dcr <- depths[current_row]
depth_row_tmp[dcr + 1] <- current_row
if (dcr >= 1) {
prev_pos <- depth_row_tmp[[dcr]]
taxLineages_p[[current_row]] <- c(taxLineages_p[[prev_pos]], current_row)
}
}
report$taxLineage <- sapply(taxLineages_p, function(x) paste0(taxLineages[x], collapse = "|"))
# report$taxLineage <- taxLineages
} else if ("taxLineage" %in% colnames(report)) {
taxLineages <- strsplit(report$taxLineage, "|", fixed = TRUE)
if (!"name" %in% colnames(report))
report$name <- sapply(taxLineages, function(x) x[length(x)])
if (!"depth" %in% colnames(report)) {
report$depth <- sapply(taxLineages, length) - 1
}
if (!"taxRank" %in% colnames(report))
report$taxRank <- toupper(substr(report$name, 0, 1))
}
if (!all(c("name", "taxRank") %in% colnames(report)) || nrow(report) < 2 || !((report[1,
"name"] == "u_unclassified" && report[2, "name"] == "-_root") || report[1, "name"] ==
"-_root")) {
message(paste("Warning: File", myfile, "does not have the required format"))
print(utils::head(report))
return(NULL)
}
if (!"taxonReads" %in% colnames(report)) {
parent <- sub("^\\(.*\\)\\|.*$", "\\1", report$taxLineage)
taxLineages <- strsplit(report$taxLineage, "|", fixed = TRUE)
## fix taxonReads
report$taxonReads <- report$cladeReads - sapply(report$name, function(x) sum(report$cladeReads[parent ==
x]))
# report$taxonReads[sapply(report$taxonReads, function(x) isTRUE(all.equal(x, 0)))] <- 0
report$taxonReads[report$taxonReads <= 1e-05] <- 0 # fix for rounding in percentages by MetaPhlAn
}
report$percentage <- signif(report$cladeReads/sum(report$taxonReads), 6) * 100
if ("n_unique_kmers" %in% colnames(report))
report$kmerpercentage <- round(report$n_unique_kmers/sum(report$n_unique_kmers, na.rm = T),
6) * 100
# report$taxRankperc <- 100/taxRank(report$cladeReads)
# report$depth <- NULL
if ("taxID" %in% colnames(report)) {
std_colnames <- c("percentage", "cladeReads", "taxonReads", "taxRank", "taxID", "name")
} else {
std_colnames <- c("percentage", "cladeReads", "taxonReads", "taxRank", "name")
}
stopifnot(all(std_colnames %in% colnames(report)))
report[, c(std_colnames, setdiff(colnames(report), std_colnames))]
}
#' Read kraken or centrifuge-style report
#'
#' @param myfile kraken report file
#' @param collapse should the results be collapsed to only those taxRanks specified in keep_taxRanks?
#' @param keep_taxRanks taxRanks to keep when collapse is TRUE
#' @param min.depth minimum depth
#' @param filter_taxon filter certain taxon names
#' @param has_header if the kraken report has a header or not
#' @param add_taxRank_columns if TRUE, for each taxRank columns are added
#'
#' @return report data.frame
#' @export
#'
read_report2 <- function(myfile, collapse = TRUE, keep_taxRanks = c("D", "K", "P", "C", "O",
"F", "G", "S"), min.depth = 0, filter_taxon = NULL, has_header = NULL, add_taxRank_columns = FALSE) {
first.line <- readLines(myfile, n = 1)
isASCII <- function(txt) all(charToRaw(txt) <= as.raw(127))
if (!isASCII(first.line)) {
message(myfile, " is no valid report - not all characters are ASCII")
return(NULL)
}
if (is.null(has_header)) {
has_header <- grepl("^[a-zA-Z]", first.line)
}
if (has_header) {
report <- utils::read.table(myfile, sep = "\t", header = T, quote = "", stringsAsFactors = FALSE,
comment.char = "#")
# colnames(report) <-
# c('percentage','cladeReads','taxonReads','taxRank','taxID','n_unique_kmers','n_kmers','perc_uniq_kmers','name')
## harmonize column names. TODO: Harmonize them in the scripts!
colnames(report)[colnames(report) == "clade_perc"] <- "percentage"
colnames(report)[colnames(report) == "perc"] <- "percentage"
colnames(report)[colnames(report) == "n_reads_clade"] <- "cladeReads"
colnames(report)[colnames(report) == "n.clade"] <- "cladeReads"
colnames(report)[colnames(report) == "n_reads_taxo"] <- "taxonReads"
colnames(report)[colnames(report) == "n.stay"] <- "taxonReads"
colnames(report)[colnames(report) == "rank"] <- "taxRank"
colnames(report)[colnames(report) == "tax_rank"] <- "taxRank"
colnames(report)[colnames(report) == "taxonid"] <- "taxID"
colnames(report)[colnames(report) == "tax"] <- "taxID"
} else {
report <- utils::read.table(myfile, sep = "\t", header = F, col.names = c("percentage",
"cladeReads", "taxonReads", "taxRank", "taxID", "name"), quote = "", stringsAsFactors = FALSE,
comment.char = "#")
}
report$depth <- nchar(gsub("\\S.*", "", report$name))/2
report$name <- gsub("^ *", "", report$name)
report$name <- paste(tolower(report$taxRank), report$name, sep = "_")
## Only stop at certain taxRanks filter taxon and further up the tree if 'filter_taxon' is
## defined
kraken.tree <- build_kraken_tree(report)
report <- collapse.taxRanks(kraken.tree, keep_taxRanks = keep_taxRanks, filter_taxon = filter_taxon)
## Add a metaphlan-style taxon string
if (add_taxRank_columns) {
report[, keep_taxRanks] <- NA
}
report$taxLineage = report$name
rows_to_consider <- rep(FALSE, nrow(report))
for (i in seq_len(nrow(report))) {
## depth > 2 correspond to taxRanks below 'D'
if (i > 1 && report[i, "depth"] > min.depth) {
## find the maximal index of a row below the current depth
idx <- report$depth < report[i, "depth"] & rows_to_consider
if (!any(idx)) {
(next)()
}
current.taxRank <- report[i, "taxRank"]
my_row <- max(which(idx))
report[i, "taxLineage"] <- paste(report[my_row, "taxLineage"], report[i, "taxLineage"],
sep = "|")
if (add_taxRank_columns) {
if (report[my_row, "taxRank"] %in% keep_taxRanks) {
taxRanks.cp <- keep_taxRanks[seq(from = 1, to = which(keep_taxRanks == report[my_row,
"taxRank"]))]
report[i, taxRanks.cp] <- report[my_row, taxRanks.cp]
}
report[i, report[i, "taxRank"]] <- report[i, "name"]
}
}
rows_to_consider[i] <- TRUE
}
report <- report[report$depth >= min.depth, ]
report$percentage <- round(report$cladeReads/sum(report$taxonReads), 6) * 100
for (column in c("taxonReads", "cladeReads")) if (all(floor(report[[column]]) == report[[column]]))
report[[column]] <- as.integer(report[[column]])
if ("n_unique_kmers" %in% colnames(report))
report$kmerpercentage <- round(report$n_unique_kmers/sum(report$n_unique_kmers, na.rm = T),
6) * 100
# report$taxRankperc <- 100/taxRank(report$cladeReads)
rownames(report) <- NULL
report
}
#' Filter lines from a kraken report result based on the taxonomy name
#'
#' It updates the read_stay counts, and removes any children below the
#' entry, and any parent entries that have no 'cladeReads' that stay
#'
#' @param report Report \code{data.frame}.
#' @param filter_taxon Name of entry to remove.
#' @param rm_clade If \code{TRUE}, remove all cladeReads at and below clade, otherwise just set the number of cladeReads that stay at taxon to zero.
#' @param do_message If \code{TRUE}, report how many rows and cladeReads were deleted.
#'
#' @return filtered report
#' @export
filter_taxon <- function(report, filter_taxon, rm_clade = TRUE, do_message = FALSE) {
taxon_depth <- NULL
taxonReads <- 0
pos.taxons <- which(sub("._", "", report$name) %in% filter_taxon)
# pos.taxon <- which(report$name := filter_taxon)
if (length(pos.taxons) == 0) {
return(report)
}
row_seq <- seq_len(nrow(report))
rows_to_delete <- rep(FALSE, nrow(report))
taxon_depths <- report[pos.taxons, "depth"]
if (isTRUE(rm_clade)) {
taxonReads <- report[pos.taxons, "cladeReads"]
} else {
taxonReads <- report[pos.taxons, "taxonReads"]
report[pos.taxons, "taxonReads"] <- 0
}
for (i in seq_along(pos.taxons)) {
pos.taxon <- pos.taxons[i]
if (pos.taxon == 1) {
rows_to_delete[1] <- TRUE
next
}
taxon_depth <- taxon_depths[i]
taxonReads <- taxonReads[i]
if (rm_clade) {
tosum_below <- row_seq >= pos.taxon & report$depth <= taxon_depth
taxons_below <- cumsum(tosum_below) == 1
rows_to_delete[taxons_below] <- TRUE
}
rows_to_update <- c(pos.taxon)
taxons_above <- seq_len(nrow(report)) < pos.taxon & report$depth == taxon_depth
any_stays <- FALSE
prev_taxon_depth <- taxon_depth
taxons_above <- c()
for (i in seq(from = (pos.taxon - 1), to = 1)) {
curr_taxon_depth <- report[i, "depth"]
if (curr_taxon_depth < prev_taxon_depth) {
if (!any_stays) {
if (report[i, "cladeReads"] == taxonReads) {
rows_to_delete[i] <- TRUE
if (do_message)
message("Deleting ", report[i, "name"])
} else {
any_stays <- TRUE
}
}
if (!rows_to_delete[i]) {
rows_to_update <- c(rows_to_update, i)
if (do_message)
message("Updating ", report[i, "name"])
}
prev_taxon_depth <- curr_taxon_depth
} else {
any_stays <- TRUE
}
}
report[rows_to_update, "cladeReads"] <- report[rows_to_update, "cladeReads"] - taxonReads
}
# if (rm_clade)
report[!rows_to_delete, ]
# else report
}