Merge pull request #8 from gustaveroussy/dev

Dev to masta : v0.3.3-1
gustaveroussy · Oct 9, 2018 · 021f859 · 021f859
2 parents 8961d31 + 82d7975
commit 021f859
Show file tree

Hide file tree

Showing 12 changed files with 647 additions and 56 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,8 +2,8 @@ Encoding: UTF-8
 Package: EaCoN
 Type: Package
 Title: EaCoN : Easy Copy Number !
-Version: 0.3.3
-Date: 2018-09-11
+Version: 0.3.3-1
+Date: 2018-09-18
 Author: Bastien Job
 Maintainer: Bastien JOB <[email protected]>
 Depends: R(>= 3.1.0)

diff --git a/NEWS b/NEWS
@@ -1,9 +1,20 @@
 EaCoN
 -----
 
+v0.3.3-1 (20181002) *LittleWomanNoCry*
+-----------------
+* BUG : Segment.SEQUENZA() : added imputation of NA values in L2R object that made copynumber::aspcf() unable to work (happened with microarrays for flagged probes, not WES).
+* BUG : Segment.SEQUENZA() : BAF filtering wasn't working properly, resulting in wrong BAF segmentation, for all microarrays.
+* BUG : CS.Process.Batch() : wrong variable name in header check.
+* CORR : OS.Process() : corrected wrong handling of sex.chr : output was forced as c("X", "Y") instead of using the variable (default c("chrX", "chrY")).
+* CORR : Segment.FACETS() Segment.SEQUENZA() : Added missing meta 'BAF.filter' in the object.
+* CORR : README.md : fixed a few links to dependencies, corrected default regex.
+* MOD : Segment.* : Changed the structure of the profile PNG filename to "[samplename].SEG.[segmenter].png" (to ease the use of regex for further steps in batch mode).
+* MOD : ASCN.ff.Batch() Annotate.ff.Batch() : corrected the default regex.
+
 v0.3.3 (20180911) *Trinity*
 -----------------
-* NEW : SEQUENZA segmentation plainly implemented, for both L2R+BAF bivariate segmentation [Segment.SEQUENZA()] AND copy number estimation [ASCN.SEQUENZA()].
+* NEW : SEQUENZA segmentation plainly implemented, for both L2R+BAF bivariate segmentation Segment.SEQUENZA() AND copy number estimation ASCN.SEQUENZA().
 * BUG : Segment.ff() : Corrected wrong do.call() call (parameters not given as a list).
 * CORR : ASCN.ASCAT() : CN output file was badly formatted.
 * MOD : ASCN.ff() : Suppressed the "segmenter" parameter, which is read from the RDS meta$eacon$segmenter value.
@@ -49,13 +60,13 @@ v0.3.0 (20180724) *PapoQueen*
 * All : Removed "EaCoN." prefix from most functions (less self-centric...)
 * All : Took care of vectors and columns that could be converted to factor or integer (to free some RAM up).
 * All : Added missing support for manual PELT penalty (only asymptotic mode was considered when SER.value was numeric).
-* SNP6 : Revamped BAF homozygous calling and rescaling.
+* SNP6 : Revamped BAF homozygous calling and rescaling. 
 * Defined the novel sets of default parameters for all supported technologies.
 * Redacted the README.md
 
 v0.2.13 (20180531) *SunIsBack*
 ------------------
-* WES : Added more data to the BIN  RDS output (counts with the reference genome nucleotide for both test and ref BAMs). This is in order to 1) filter out on a minimum alternative allele count 2) allow the use of other segmenters that do not rely on BAF but rather on AD (like PSCBS) or logOR (like FACETS).
+* WES : Added more data to the BIN RDS output (counts with the reference genome nucleotide for both test and ref BAMs). This is in order to 1) filter out on a minimum alternative allele count 2) allow the use of other segmenters that do not rely on BAF but rather on AD (like PSCBS) or logOR (like FACETS).
 * Modified the subthreading scheme for EaCoN.WES.Bin() : now each subthread has its own connection to the BAM files. This allows each thread to work fully (but increases simultaneous IO).
 * Now each SNP variant has its corresponding bin index, which will allow to perform density-based selection like in FACETS.
 

diff --git a/R/EaCoN_functions.R b/R/EaCoN_functions.R
diff --git a/R/apt_cytoscan_process.R b/R/apt_cytoscan_process.R
@@ -27,7 +27,7 @@ CS.Process <- function(CEL = NULL, samplename = NULL, dual.norm = FALSE, normal.
   # require(foreach)
   # source("~/git_gustaveroussy/EaCoN/R/mini_functions.R")
   # source("~/git_gustaveroussy/EaCoN/R/renorm_functions.R")
-  # source("~/git_gustaveroussy/EaCoN/R/germline_functions.R")
+
 
 
   ## Early checks
@@ -384,8 +384,9 @@ CS.Process.Batch <- function(CEL.list.file = NULL, nthread = 1, cluster.type = "
   if (!file.exists(CEL.list.file)) stop("Could not find CEL.list.file !")
   message("Reading and checking CEL.list.file ...")
   myCELs <- read.table(file = CEL.list.file, header = TRUE, sep="\t", check.names = FALSE, as.is = TRUE)
-  head.ok <- c("cel_files", "SampleName")
-  head.chk <- all(colnames(CEL.list.file) == head.ok)
+  head.ok <- c("CEL", "SampleName")
+  head.chk <- all(colnames(myCELs) == head.ok)
+
   if (!head.chk) {
     message("Invalid header in CEL.list.file !")
     message(paste0("EXPECTED : ", head.ok))
@@ -398,9 +399,10 @@ CS.Process.Batch <- function(CEL.list.file = NULL, nthread = 1, cluster.type = "
     message(myCELs$SampleName[which(duplicated(myCELs$SampleName))])
     stop("Duplicated SampleNames.")
   }
-  fecheck <- !vapply(myCELs$cel_files, file.exists, TRUE)
+
+  fecheck <- !vapply(myCELs$CEL, file.exists, TRUE)
   fecheck.pos <- which(fecheck)
-  if (length(fecheck.pos) > 0) stop(paste0("\n", "CEL file could not be found : ", myCELs$cel_files[fecheck.pos], collapse = ""))
+  if (length(fecheck.pos) > 0) stop(paste0("\n", "CEL file could not be found : ", myCELs$CEL[fecheck.pos], collapse = ""))
 
   message(paste0("Found ", nrow(myCELs), " samples to process."))
 
@@ -421,7 +423,7 @@ CS.Process.Batch <- function(CEL.list.file = NULL, nthread = 1, cluster.type = "
   p <- 0
   csres <- foreach::foreach(p = seq_len(nrow(myCELs)), .inorder = FALSE, .errorhandling = "pass") %dopar% {
     EaCoN.set.bitmapType(type = current.bitmapType)
-    CS.Process(CEL = myCELs$cel_files[p], samplename = myCELs$SampleName[p], ...)
+    CS.Process(CEL = myCELs$CEL[p], samplename = myCELs$SampleName[p], ...)
   }
 
   ## Stopping cluster

diff --git a/R/apt_oncoscan_process.R b/R/apt_oncoscan_process.R
@@ -263,7 +263,7 @@ OS.Process <- function(ATChannelCel = NULL, GCChannelCel = NULL, samplename = NU
       chrs = unique(ao.df$chr),
       samples = samplename,
       gender = as.vector(meta.b$predicted.gender),
-      sexchromosomes = c("X", "Y"),
+      sexchromosomes = sex.chr,
       failedarrays = NULL
     ),
     meta = list(

diff --git a/README.md b/README.md
@@ -344,10 +344,10 @@ As for the **WES.Normalize.ff.Batch** function, the **Segment.ff.Batch** functio
 Here is a synthetic example that will segment our CytoScan HD samples (as defined by the _pattern_ below) using ASCAT :
 
 ```R
-Segment.ff.Batch(RDS.files = list.files(path = getwd(), pattern = "_CSHD.*_processed.RDS$", full.names = TRUE, recursive = TRUE), segmenter = "ASCAT", smooth.k = 5, SER.pen = 20, nrf = 1.0, nthread = 2)
+Segment.ff.Batch(RDS.files = list.files(path = getwd(), pattern = ".*_processed.RDS$", full.names = TRUE, recursive = TRUE), segmenter = "ASCAT", smooth.k = 5, SER.pen = 20, nrf = 1.0, nthread = 2)
 ```
 
-- To perform the same using the **FACETS** segmenter, just change the value of the _segmenter_ parameter !
+- To perform the same using the **FACETS** segmenter, just change the value of the _segmenter_ parameter, but **please remember that FACETS will only work with WES data !**
 
 - I suppose you guessed how to do the same with **SEQUENZA**, right ? ;)
 
@@ -356,17 +356,22 @@ Segment.ff.Batch(RDS.files = list.files(path = getwd(), pattern = "_CSHD.*_proce
 Still the same, with the **ASCN.ff.Batch** :
 
 ```R
-ASCN.ff.Batch(RDS.files = list.files(path = getwd(), pattern = "_CSHD.*_EaCoN.ASPCF.RDS$", full.names = TRUE, recursive = TRUE), nthread = 2)
+ASCN.ff.Batch(RDS.files = list.files(path = getwd(), pattern = "SEG\\.ASCAT\\.RDS$", full.names = TRUE, recursive = TRUE), nthread = 2)
 ```
 
+- To perform the same using results obtained using the **FACETS** or **SEQUENZA** segmenter, just edit the _pattern_ argument with the name of the corresponding segmenter.
+
+
 #### **HTML reporting**
 
 And here again with the **Annotate.ff.Batch** :
 
 ```R
-Annotate.ff.Batch(RDS.files = list.files(path = getwd(), pattern = "_CSHD.*_EaCoN.ASPCF.RDS$", full.names = TRUE, recursive = TRUE), author.name = "Me!")
+Annotate.ff.Batch(RDS.files = list.files(path = getwd(), pattern = "SEG\\.ASCAT\\.RDS$", full.names = TRUE, recursive = TRUE), author.name = "Me!")
 ```
 
+- To perform the same using results obtained using the **FACETS** or **SEQUENZA** segmenter, just edit the _pattern_ argument with the name of the corresponding segmenter.
+
 ### **Piped**
 
 EaCoN has been implemented in a way that one can also choose to launch the full workflow in a single command line for a single sample, using pipes from the [magrittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) package. However, this is not recommended as default use : even though EaCoN is provided with recommendations that should fit most cases, the user may have to deal with particular profiles that would require parameter tweaking, which is not possible in piped mode...
@@ -393,7 +398,7 @@ OS.Process(ATChannelCel = "/home/me/my_project/CEL/SAMPLE1_OncoScan_CNV_A.CEL",
 
 ## **GUIDELINES**
 
-### **Segmentation using ASCAT**
+### **Segmentation**
 
 - For each step, default values for each data source already correspond to recommendations. However, for the common **segmentation** step using the ASCAT segmenter, adaptation to the data source is recommended, by changing a few parameters :
 

diff --git a/inst/extdata/html_report.Rmd b/inst/extdata/html_report.Rmd
@@ -21,7 +21,7 @@ always_allow_html: yes
 
 ```{r setup, echo = FALSE, include = FALSE}
 `%>%` <- magrittr::"%>%"
-show.flag <- if ((data$meta$basic$source == "microarray") & (data$meta$basic$manufacturer == "Affymetrix")) TRUE else FALSE
+# show.flag <- if ((data$meta$basic$source == "microarray") & (data$meta$basic$manufacturer == "Affymetrix")) TRUE else FALSE
 knitr::opts_knit$set(base.dir = tempdir())
 ```
 
@@ -36,6 +36,12 @@ DT::datatable(data = array.df, rownames = FALSE, caption = "", class = "cell-bor
 cat("<HR><HR><BR><BR>\n")
 ```
 
+```{r wes_info, results = "asis", echo = FALSE, eval = as.logical(!show.flag)}
+cat('# WES Data Information\n')
+## Insert table here !
+cat("<HR><HR><BR><BR>\n")
+```
+
 <!-- *** -->
 <!-- *** -->
 
@@ -47,6 +53,12 @@ cat(paste0("\n![](", intplotf, ")\n"))
 cat("<HR><HR><BR><BR>\n")
 ```
 
+```{r covplot, results = "asis", fig.height = 10, fig.width = 10, fig.align="center", echo = FALSE, eval = FALSE}
+cat("# Coverage Plot\n")
+cat(paste0("\n![](", covplotf, ")\n"))
+cat("<HR><HR><BR><BR>\n")
+```
+
 <!-- *** -->
 <!-- *** -->
 

diff --git a/man/ASCN.ff.Batch.Rd b/man/ASCN.ff.Batch.Rd
@@ -3,7 +3,7 @@
 \title{Allele-Specific Copy Number estimation, from RDS files in batch mode, with multithreading.}
 \usage{
   ASCN.ff.Batch(RDS.files = list.files(path = getwd(),
-  pattern = ".EaCoN.ASPCF.RDS$", full.names = TRUE, recursive = TRUE,
+  pattern = "SEG\\.ASCAT\\.RDS$", full.names = TRUE, recursive = TRUE,
   ignore.case = TRUE, include.dirs = FALSE), nthread = 1,
   cluster.type = "PSOCK", ...)
 }

diff --git a/man/Annotate.ff.Batch.Rd b/man/Annotate.ff.Batch.Rd
@@ -6,7 +6,7 @@
 }
 \usage{
   Annotate.ff.Batch(RDS.files = list.files(path = getwd(),
-  pattern = ".EaCoN.ASPCF.RDS$", full.names = TRUE, recursive = TRUE,
+  pattern = "\\.SEG\\.ASCAT\\.RDS$", full.names = TRUE, recursive = TRUE,
   ignore.case = TRUE, include.dirs = FALSE), nthread = 1,
   cluster.type = "PSOCK", ...)
 }

diff --git a/man/CS.Process.Batch.Rd b/man/CS.Process.Batch.Rd
@@ -16,7 +16,7 @@
 \details{
   \code{CEL.list.file} is a tab-separated text file containing 2 columns (header and specified column names are mandatory) :
     \itemize{
-      \item{cel_files : Name (and path) of the CEL file(s)}
+      \item{CEL : Name (and path) of the CEL file(s)}
       \item{SampleName : The output sample name(s)}
     }
 }

diff --git a/man/Segment.FACETS.Rd b/man/Segment.FACETS.Rd
@@ -2,7 +2,7 @@
 \alias{Segment.FACETS}
 \title{L2R and BAF joint segmentation using FACETS.}
 \usage{
-  Segment.FACETS(data = NULL, smooth.k = NULL, BAF.filter = .9, homoCut = .05,
+  Segment.FACETS(data = NULL, smooth.k = NULL, BAF.filter = .75, homoCut = .05,
   FACETS.pen = 150, recenter = "l2r.centeredpeak", calling.method = "mad",
   nrf = .5, SER.pen = 2, out.dir = getwd(), return.data = FALSE,
   write.data = TRUE, plot = TRUE, force = FALSE)

diff --git a/man/Segment.SEQUENZA.Rd b/man/Segment.SEQUENZA.Rd
@@ -2,9 +2,9 @@
 \alias{Segment.SEQUENZA}
 \title{L2R and BAF joint segmentation using SEQUENZA.}
 \usage{
-  Segment.SEQUENZA(data = NULL, smooth.k = NULL, BAF.filter = .9, homoCut = .05,
+  Segment.SEQUENZA(data = NULL, smooth.k = NULL, BAF.filter = .75, homoCut = .05,
   SEQUENZA.pen = 50, recenter = "l2r.centeredpeak", calling.method = "mad",
-  nrf = .5, SER.pen = 2, out.dir = getwd(), return.data = FALSE,
+  nrf = .5, SER.pen = 40, out.dir = getwd(), return.data = FALSE,
   write.data = TRUE, plot = TRUE, force = FALSE)
 }
 \arguments{