-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNuclAnalysis_fcn.R
281 lines (222 loc) · 9.48 KB
/
NuclAnalysis_fcn.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
### Functions for Nucleotide data processing
#' @title Import files
#'
#' @description \code{import_files} imports all \code{*.unknown} files exported via the
#' TOPPAS pipeline and stored in a defined input folder.
#' It also takes care of the required modifications of the input data, e.g., extraction
#' of information encoded in the file names and renaming of column names.
#' The combined data are written to \code{output/<folder>_export.csv}, where
#' \code{<folder>} is the second "/"-separated component of \code{path_files}.
#'
#' @param path_files Character; folder containing the \code{*.unknown} files, e.g.
#' "input/samples/". The second path component names the exported .csv file and
#' selects which plots are drawn ("calibrations" or "samples").
#' @param mode Character; define either as "samples" or "cal" for calibration files
#' @param true_blank Not used by this function; kept so that existing calls keep working.
#' @param conv_list Data frame; containing conversion information, e.g., nuc_nb into nuc_id.
#' Merged with every imported file on the shared column names.
#' @param condition_list Data frame; additional annotation merged onto the data in both
#' modes (for calibrations: the concentration level of each calibration curve).
#' @param plot_graph TRUE / FALSE for the graphical output
#'
#' @return The function returns a dataframe summarising all input information provided in the
#' single input files
#'
#' @examples import_files("input/samples/", mode = "samples")
#' @examples import_files("input/cal/", mode = "cal")
#'
#'
import_files <- function(path_files, mode, true_blank, conv_list, condition_list, plot_graph = FALSE) {
  # Fail early on an unsupported mode instead of hitting an undefined object later
  mode <- match.arg(mode, c("samples", "cal"))

  # Anchor the pattern: a bare ".unknown" is a regex in which "." matches anything
  file_names <- list.files(path_files, pattern = "\\.unknown$")
  if (length(file_names) == 0) {
    stop("No .unknown files found in: ", path_files, call. = FALSE)
  }

  # Import and annotate one *.unknown file
  read_one <- function(file_name) {
    data <- read.table(file.path(path_files, file_name), header = TRUE)

    # clean up header
    colnames(data)[grepl("median", colnames(data))] <- "median_intensity"
    colnames(data)[grepl("nuc", colnames(data))] <- "nuc_nb"

    # the file name carries "_"-separated annotation fields
    name_parts <- unlist(strsplit(file_name, split = "_", fixed = TRUE))

    if (mode == "cal") {
      ############## transformation for data == calibration ------------------
      # extract: what NucleoMix (4th field)
      data$calcurve <- name_parts[4]
      # extract: replicate of the calibration curve (5th field, up to the first ".")
      repl_part <- unlist(strsplit(name_parts[5], split = ".", fixed = TRUE))[1]
      data$repl_calcurve <- as.numeric(repl_part)
      # extract: date (1st field)
      data$date <- name_parts[1]
      # merge: nuc_nb matching up with nuc_id (unmatched rows are dropped)
      data_conv <- merge(data, conv_list)
      keep_cols <- c("nuc_id", "nuc", "nuc_group", "transition_id", "calcurve",
                     "level", "repl_calcurve", "date", "median_intensity", "cv")
    } else {
      ############## transformation for data == samples ----------------------
      # extract: file tag (3rd field) and date (1st field)
      data$file_tag <- name_parts[3]
      data$date <- name_parts[1]
      # merge: sample group annotation (unmatched rows are kept)
      data_conv <- merge(data, conv_list, all.x = TRUE)
      keep_cols <- c("date", "file_tag", "sample", "nuc_id", "nuc", "transition_id",
                     "median_intensity", "cv")
    }

    # merge: calibration curve concentrations / sample annotation
    data_conv <- merge(data_conv, condition_list, all.x = TRUE)
    # transform: transition (numeric) into a "t<transition>" label
    data_conv <- ddply(data_conv, c("nuc_id", "transition"), transform,
                       transition_id = paste0("t", transition))
    # select dataframe columns
    data_conv <- data_conv[, keep_cols]

    # check matching annotation file
    if (nrow(data_conv) == 0) {
      message("ERROR: Check annotation file. Empty dataframe created!")
    }
    data_conv
  }

  # one data frame per file, stacked into a single data frame
  data_export <- do.call(rbind, lapply(file_names, read_one))

  # export into .csv file, named after the second component of the input path
  temp_extr <- unlist(strsplit(path_files, split = "/", fixed = TRUE))[2]
  write.csv(data_export, paste0("output/", temp_extr, "_export.csv"), row.names = FALSE)
  message("Done - input data exported as .csv-file: ", temp_extr)

  # isTRUE() keeps the conditions scalar and NA-safe (temp_extr is NA for a
  # path without a second component)
  if (isTRUE(plot_graph) && isTRUE(temp_extr == "calibrations")) {
    for (var in unique(data_export$nuc_group)) {
      print(
        ggplot(subset(data_export, data_export$nuc_group == var & data_export$calcurve != "TrueBlank"),
               aes(x = level, y = median_intensity, colour = date)) +
          geom_point() +
          facet_grid(nuc~transition_id, scales = "free_y") +
          theme_bw() +
          theme(axis.text.x = element_text(size = 6),
                axis.text.y = element_text(size = 8)) +
          ggtitle(paste0("Nucleotide (Calibration): ", var)) +
          theme(strip.text = element_text(size = 8),
                strip.background = element_rect(fill = "white")) +
          scale_y_continuous(labels = function(x) format(x, scientific = TRUE),
                             breaks = NULL) +
          scale_color_manual(values = c("dodgerblue3", "black", "red"))
      )
    }
  }

  if (isTRUE(plot_graph) && isTRUE(temp_extr == "samples")) {
    for (var in unique(data_export$nuc)) {
      print(
        ggplot(subset(data_export, data_export$nuc == var),
               aes(sample, median_intensity, fill = date)) +
          geom_boxplot() +
          facet_wrap(~transition_id) +
          theme_bw() +
          coord_flip() +
          theme(axis.text.x = element_text(size = 6),
                axis.text.y = element_text(size = 8)) +
          ggtitle(paste0("Nucleotide (samples): ", var)) +
          theme(strip.text = element_text(size = 8),
                strip.background = element_rect(fill = "white")) +
          scale_y_continuous(labels = function(x) format(x, scientific = TRUE),
                             breaks = NULL) +
          scale_color_manual(values = c("dodgerblue3", "black", "red"))
      )
    }
  }

  data_export
}
summarySE <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                      conf.interval = .95, .drop = TRUE) {
  #' @title Create statistics
  #'
  #' @description Summarises one measured column per group: number of
  #' observations, mean, standard deviation, standard error of the mean and
  #' the half-width of the confidence interval.
  #'
  #' @param data Data frame to summarise
  #' @param measurevar Character; name of the column holding the measured values
  #' @param groupvars Character vector; names of the grouping columns
  #' @param na.rm Logical; if TRUE, NA values are ignored and not counted in N
  #' @param conf.interval Numeric; confidence level, e.g. .95
  #' @param .drop Passed on to ddply()
  #'
  #' @return Data frame with the grouping columns plus N, the group mean
  #' (stored under the name given in measurevar), sd, se and ci
  #'
  require(plyr)

  # Per-group summary: N (NA-aware when na.rm is TRUE), mean and sd
  summarise_group <- function(group_df) {
    values <- group_df[[measurevar]]
    n_obs <- if (na.rm) sum(!is.na(values)) else length(values)
    c(N = n_obs,
      mean = mean(values, na.rm = na.rm),
      sd = sd(values, na.rm = na.rm))
  }
  stats_by_group <- ddply(data, groupvars, .fun = summarise_group, .drop = .drop)

  # The "mean" column takes the name of the measured variable
  stats_by_group <- rename(stats_by_group, c("mean" = measurevar))

  # Standard error of the mean
  stats_by_group$se <- stats_by_group$sd / sqrt(stats_by_group$N)

  # Two-sided t multiplier: for conf.interval = .95 this is the .975 quantile
  # of the t distribution with N - 1 degrees of freedom
  t_mult <- qt(conf.interval / 2 + .5, stats_by_group$N - 1)
  stats_by_group$ci <- stats_by_group$se * t_mult

  return(stats_by_group)
}
qcurve <- function(df) {
  #' @title Determine calibration curve
  #'
  #' @description Fits a linear regression \code{mean ~ level} for every
  #' nucleotide, appends adjusted r-squared, intercept and slope of that fit
  #' to each row, writes the result to "Calcurve_variables.tsv" and builds a
  #' faceted log-log plot of the calibration points.
  #'
  #' @param df dataframe with the columns nuc, level and mean
  #'
  #' @return ggplot object showing the calibration curve of each nucleotide
  #'
  # Fit the regression once per nucleotide and derive all three values from
  # that single fit
  add_fit <- function(piece) {
    fit <- lm(mean ~ level, data = piece)
    coefs <- coefficients(fit)
    piece$adj.r.squared <- summary(fit)$adj.r.squared
    piece$intercept <- coefs[[1]]
    piece$slope <- coefs[[2]]
    piece
  }
  df <- ddply(df, c("nuc"), add_fit)

  # let's write these data into a file
  write.table(df, "Calcurve_variables.tsv", row.names = FALSE, col.names = TRUE,
              sep = "\t", quote = FALSE)

  # given above info, let's write it into our graphs
  ggplot(df, aes(x = level, y = mean)) +
    geom_point(size = 3) +
    geom_smooth(method = "lm", se = FALSE) +
    facet_wrap(~nuc, scales = 'free') +
    theme_bw() +
    ggtitle('Calibration curves (incl. regression coefficient)') +
    scale_x_log10() +
    scale_y_log10() +
    # the r2 label is drawn at a fixed position (x = 400, y = 10000) in every facet
    geom_text(aes(label = paste0("r2=", format(round(adj.r.squared, 3), nsmall = 3)), x = 400, y = 10000), size = 3) +
    theme(legend.position = "none")
}
islinear <- function(met, test, calib = qt) {
  #' @title Linearity check
  #'
  #' @description This function extracts the minimum and maximum value of each calibration curve
  #' and validates the location of each measurement accordingly.
  #'
  #' @param met Metabolite name
  #' @param test Measured intensity (or intensities) of a sample
  #' @param calib Data frame with the columns nuc and mean describing the
  #' calibration curves. Defaults to the object named qt that is visible from
  #' this function, which is what the function always read before.
  #'
  #' @return "na" if every value in test is NA; otherwise a character vector
  #' with one of "linear", "below" or "above" per value in test
  #'
  if (all(is.na(test))) {
    return("na")
  }

  # Calibration means of this metabolite span the range treated as linear.
  # If met has no calibration rows, min()/max() return Inf/-Inf (with a
  # warning) and every value is labelled "below".
  cal_means <- calib$mean[calib$nuc == as.character(met)]
  curmin <- min(cal_means, na.rm = TRUE) # min val of calibration
  curmax <- max(cal_means, na.rm = TRUE) # max val of calibration

  # Returned visibly; ending on an assignment would hide the result at the prompt
  ifelse(test >= curmin & test <= curmax, "linear",
         ifelse(test < curmin, "below", "above"))
}
absconc <- function(nuc, mean, calib = qt) {
  #' @title Absolute concentration
  #'
  #' @description Converts measured values into concentrations by inverting the
  #' linear calibration curve of the nucleotide: (mean - intercept) / slope.
  #'
  #' @param nuc Nucleotide name(s)
  #' @param mean Measured value(s); NA values give NA
  #' @param calib Data frame with the columns nuc, intercept and slope.
  #' Defaults to the object named qt that is visible from this function, which
  #' is what the function always read before.
  #'
  #' @return Numeric vector of concentrations; NA where mean is NA or where
  #' nuc has no entry in calib
  #'
  # match() picks the first calibration row of each nucleotide and works for
  # vectors, so the function no longer fails on input longer than one
  row_idx <- match(as.character(nuc), as.character(calib$nuc))
  intercept <- calib$intercept[row_idx]
  slope <- calib$slope[row_idx]

  # NA in mean propagates to NA without a separate branch
  (mean - intercept) / slope
}