-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNucl_calibration_fcn.R
266 lines (211 loc) · 10.1 KB
/
Nucl_calibration_fcn.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
##### Functions for Calibration samples - Nucleotides -------------------
#' @title Import files
#'
#' @description \code{import_files} imports all files exported via the TOPPAS pipeline
#' that are stored in a defined input folder.
#' It also takes care of required modifications of the input data, e.g., extraction
#' of file name components and renaming of column names.
#'
#' @details File names are split at "_". For \code{mode = "cal"} element 1 is used as
#' \code{date}, element 4 as \code{calcurve} and element 5 (up to the first ".") as
#' \code{repl_calcurve}. For \code{mode = "samples"} element 1 is used as \code{date}
#' and element 3 as \code{file_tag}. The combined table is written to
#' \code{output/<x>_export.csv}, where \code{<x>} is the second "/"-separated element
#' of \code{path_files} (e.g. "cal" for "input/cal/"); the folder \code{output/} has
#' to exist. The function relies on \code{ddply} (plyr) and, for plotting, on ggplot2
#' being attached by the caller.
#'
#' @param path_files Character; folder that contains the *.unknown files.
#' @param mode Character; define either as "samples" or "cal" for calibration files.
#' Any other value raises an error.
#' @param conv_list Data frame; containing conversion information, e.g., nuc_nb into nuc_id.
#' Merged with each imported file on the shared column names.
#' @param condition_list Data frame; merged onto the converted data (left join on the
#' shared column names). For \code{mode = "cal"} it has to provide the column
#' \code{level}, for \code{mode = "samples"} the column \code{sample}.
#' The argument has no default value.
#' @param plot_graph TRUE / FALSE for the graphical output (one plot per \code{nuc_group};
#' nothing is drawn if the imported data has no \code{nuc_group} column).
#'
#' @return The function returns a dataframe summarising all input information provided in the
#' single input files.
#'
#' @examples
#' \dontrun{
#' import_files("input/samples/", mode = "samples",
#'              conv_list = conv_list, condition_list = condition_list)
#' import_files("input/cal/", mode = "cal",
#'              conv_list = conv_list, condition_list = condition_list)
#' }
#'
#' @export
import_files <- function(path_files, mode, conv_list, condition_list, plot_graph = FALSE) {
  # Fail early with a readable message instead of "object 'data_conv' not found".
  if (!is.character(mode) || length(mode) != 1L || !mode %in% c("samples", "cal")) {
    stop("`mode` must be either \"samples\" or \"cal\".", call. = FALSE)
  }

  # The dot is escaped so that only a literal ".unknown" is matched.
  file_paths <- list.files(path_files, pattern = "\\.unknown", full.names = TRUE)
  if (length(file_paths) == 0) {
    stop("No .unknown files found in: ", path_files, call. = FALSE)
  }

  imported <- vector("list", length(file_paths))
  for (i in seq_along(file_paths)) {
    # import *.unknown file
    data <- read.table(file_paths[i], header = TRUE)

    # clean up header
    colnames(data)[grepl("median", colnames(data))] <- "median_intensity"
    colnames(data)[grepl("nuc", colnames(data))] <- "nuc_nb"

    # all annotation is encoded in the "_"-separated parts of the file name
    name_parts <- unlist(strsplit(basename(file_paths[i]), split = "_", fixed = TRUE))

    if (mode == "cal") {
      #### transformation for data == calibration ------------------------------
      # extract: what NucleoMix
      data$calcurve <- name_parts[4]
      # extract: replicate calcurve (text in front of the first ".")
      data$repl_calcurve <- as.numeric(
        unlist(strsplit(name_parts[5], split = ".", fixed = TRUE))[1]
      )
      # extract: date
      data$date <- name_parts[1]
      # merge: nuc_nb matching up with nuc_id (rows without a match are dropped)
      data_conv <- merge(data, conv_list)
      # merge: calibration curve concentrations
      data_conv <- merge(data_conv, condition_list, all.x = TRUE)
      # transform: transition (numeric) into a "t"-prefixed label
      data_conv <- ddply(data_conv, c("nuc_id", "transition"), transform,
                         transition_id = paste0("t", transition))
      # select dataframe columns
      data_conv <- data_conv[, c("nuc_id", "nuc", "nuc_group", "transition_id",
                                 "calcurve", "level", "repl_calcurve",
                                 "date", "median_intensity", "cv")]
    } else {
      #### transformation for data == samples ----------------------------------
      # extract: file tag and date
      data$file_tag <- name_parts[3]
      data$date <- name_parts[1]
      # merge: sample group annotation
      data_conv <- merge(data, conv_list, all.x = TRUE)
      # merge: add annotation
      data_conv <- merge(data_conv, condition_list, all.x = TRUE)
      # transform: transition (numeric) into a "t"-prefixed label
      data_conv <- ddply(data_conv, c("nuc_id", "transition"), transform,
                         transition_id = paste0("t", transition))
      # select dataframe columns
      data_conv <- data_conv[, c("date", "file_tag", "sample", "nuc_id", "nuc",
                                 "transition_id", "median_intensity", "cv")]
    }

    imported[[i]] <- data_conv
  }

  # bind the per-file tables into one data frame
  data_export <- do.call(rbind, imported)

  # check matching annotation file (empty file)
  if (nrow(data_export) == 0) {
    message("ERROR: Check annotation file. Empty dataframe created!")
  } else {
    message("Checked - successful merge annotation file and input data!")
  }

  # export into .csv file; the file name uses the second element of path_files
  temp_extr <- unlist(strsplit(path_files, split = "/", fixed = TRUE))[2]
  write.csv(data_export, paste0("output/", temp_extr, "_export.csv"), row.names = FALSE)
  message("Done - input data exported as .csv-file: ", temp_extr)

  if (isTRUE(plot_graph)) {
    for (var in unique(data_export$nuc_group)) {
      print(
        ggplot(subset(data_export,
                      data_export$nuc_group == var & data_export$calcurve != "TrueBlank"),
               aes(x = level, y = median_intensity, colour = date)) +
          geom_point() +
          facet_grid(nuc~transition_id, scales = "free_y") +
          theme_bw() +
          theme(axis.text.x = element_text(size = 6),
                axis.text.y = element_text(size = 8)) +
          ggtitle(paste0("Nucleotide (Calibration): ", var)) +
          theme(strip.text = element_text(size = 8),
                strip.background = element_rect(fill = "white")) +
          scale_y_continuous(labels = function(x) format(x, scientific = TRUE),
                             breaks = NULL) +
          scale_color_manual(values = c("dodgerblue3", "black", "red"))
      )
    }
  }

  data_export
}
#' Evaluation of calibration curves
#'
#' Evaluation of calibration curves for each nucleotide and defined transitions.
#' This function checks the following parameter: (i) noise level and (ii) saturation.
#' The noise tag is evaluated and added to the data frame that is returned.
#'
#' Noise (\code{tag_noise}): the mean \code{median_intensity} of the true-blank
#' rows is calculated per \code{nuc_id}, \code{nuc_group} and \code{transition_id}
#' (\code{mean_tb_val}); the true-blank rows are then removed and every remaining row
#' is tagged "below_tb" if its \code{median_intensity} is smaller than or equal to
#' \code{mean_tb_val}, otherwise "above_tb".
#'
#' Saturation: for levels >= 5 the mean intensity per level is related to the mean
#' intensity of level 10. NOTE(review): this ratio is calculated but not yet attached
#' to the returned data, so no \code{tag_saturation} column is created. Writing a .csv
#' file is not implemented either; \code{path_files} is currently unused.
#'
#' @param cal_data Dataframe, in ideal case generated by the function import_files(mode = "cal")
#' @param eval_trueblank TRUE / FALSE; evaluate the noise level against the true blank.
#' @param true_blank Character; value of \code{calcurve} that marks true-blank rows.
#' @param excl_below_tb TRUE / FALSE; exclude rows tagged "below_tb" from the plots
#' (only applied if \code{tag_noise} is available).
#' @param eval_saturation TRUE / FALSE; run the saturation calculation.
#' @param eval_level Currently unused.
#' @param excl_saturated TRUE / FALSE; exclude rows tagged "saturated" from the plots
#' (only applied if a \code{tag_saturation} column is available).
#' @param incl_plot TRUE / FALSE; print one plot per \code{nuc_group}.
#' @param path_files path, defining export folder within the .Rproj-folder (currently unused)
#' @param ... Currently unused.
#'
#' @return dataframe containing tags with corresponding information (returned invisibly).
#'
#' @export
evaluate_calibrations <- function(cal_data, eval_trueblank = TRUE,
                                  true_blank = "TrueBlank",
                                  excl_below_tb = TRUE,
                                  eval_saturation = FALSE, eval_level = "top2",
                                  excl_saturated = FALSE,
                                  incl_plot = TRUE, path_files = "output/", ...) {
  data_export <- cal_data

  if (isTRUE(eval_trueblank)) {
    #### TRUE-BLANK - $tag_noise ---------------------------------------------
    # mean true-blank intensity, one row per nuc_id / nuc_group / transition_id
    tb_rows <- subset(data_export, data_export$calcurve == true_blank)
    if (nrow(tb_rows) == 0) {
      stop("No rows with calcurve == \"", true_blank, "\" found in `cal_data`.",
           call. = FALSE)
    }
    tb_mean <- ddply(tb_rows, c("nuc_id", "nuc_group", "transition_id"), summarise,
                     mean_tb_val = mean(median_intensity))

    # merge: true-blank values and input data, then drop the blank rows themselves
    data_values <- merge(data_export, tb_mean)
    data_values <- subset(data_values, data_values$calcurve != true_blank)

    # evaluate $tag_noise: below or above true blank
    data_values$tag_noise <- ifelse(
      data_values$median_intensity <= data_values$mean_tb_val,
      "below_tb", "above_tb"
    )
    data_export <- data_values
  }

  if (isTRUE(eval_saturation)) {
    #### SATURATION ----------------------------------------------------------
    # mean intensity per level (levels >= 5) relative to the level-10 mean
    data_sat <- subset(data_export, data_export$level >= 5)
    data_sat <- ddply(data_sat,
                      c("nuc_id", "nuc", "nuc_group", "transition_id", "date", "level"),
                      summarise,
                      n_int = length(median_intensity),
                      mean_int = mean(median_intensity),
                      sd_int = sd(median_intensity))
    ref_level10 <- subset(data_sat, data_sat$level == 10)
    ref_level10 <- unique(ref_level10[, c("nuc_id", "nuc", "date", "transition_id",
                                          "mean_int")])
    colnames(ref_level10)[grepl("mean_int", colnames(ref_level10))] <- "ref_l10_int"
    data_sat_m <- merge(data_sat, ref_level10, all.x = TRUE)
    # NOTE(review): ratio_5 is not used further yet; a tagging rule is still missing.
    data_sat_m$ratio_5 <- data_sat_m$mean_int / data_sat_m$ref_l10_int
  }

  # Always defined, so plotting also works when eval_saturation = TRUE.
  data_fin <- data_export

  if (isTRUE(incl_plot)) {
    ###### plotting ------------------------------------------------------------
    data_plot <- data_fin

    # (1 - tag_noise) only filter when the tag exists; otherwise all rows would be lost
    if (isTRUE(excl_below_tb) && "tag_noise" %in% colnames(data_plot)) {
      data_plot <- subset(data_plot, data_plot$tag_noise != "below_tb")
      message("Done - excluded values below true blank levels!")
    }
    # (2 - tag_saturation)
    if (isTRUE(excl_saturated) && "tag_saturation" %in% colnames(data_plot)) {
      data_plot <- subset(data_plot, data_plot$tag_saturation != "saturated")
      message("Done - excluded saturated levels!")
    }

    # the true-blank reference line needs mean_tb_val, which only exists
    # after the true-blank evaluation
    has_tb_line <- "mean_tb_val" %in% colnames(data_plot)

    for (var in unique(data_plot$nuc_group)) {
      group_plot <- ggplot(subset(data_plot, data_plot$nuc_group == var),
                           aes(x = level, y = median_intensity, colour = factor(date))) +
        geom_point() +
        facet_grid(nuc ~ transition_id, scales = "free_y") +
        theme_bw() +
        theme(axis.text.x = element_text(size = 6),
              axis.text.y = element_text(size = 8)) +
        ggtitle(paste0("Nucleotide (Calibration): ", var)) +
        theme(strip.text = element_text(size = 8),
              strip.background = element_rect(fill = "white")) +
        scale_y_continuous(labels = function(x) format(x, scientific = TRUE),
                           breaks = NULL) +
        scale_color_manual(values = c("dodgerblue3", "black", "red"))
      if (has_tb_line) {
        group_plot <- group_plot +
          geom_hline(aes(yintercept = mean_tb_val), color = "grey", linetype = 2)
      }
      print(group_plot)
    }
  }

  # invisible: nothing is printed at the console when the result is not assigned
  invisible(data_fin)
}
#' Graphical output of calibration curves
#'
#' This function generates decent ggplot2 graphics for the evaluation
#' of calibration curves. Depending on the presence of tags, e.g., tag_noise or
#' tag_saturation, the plots visualise the corresponding content.
#'
#' @param cal_data
#'
#' @return Plot object that can be printed
#'
#plot_calibrations <- function(cal_data, ...) {
#
#
#}