Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add workflow to download multiple SRA accessions to multiple bams: fetch_multiple_sra_to_bams #537

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
5 changes: 5 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/fetch_sra_to_bam.wdl
testParameterFiles:
- /empty.json
# Registers the fetch_multiple_sra_to_bams WDL workflow; the descriptor path below points at its .wdl file.
- name: fetch_multiple_sra_to_bams
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/fetch_multiple_sra_to_bams.wdl
testParameterFiles:
- /empty.json
- name: filter_classified_bam_to_taxa
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/filter_classified_bam_to_taxa.wdl
Expand Down
5 changes: 3 additions & 2 deletions pipes/WDL/tasks/tasks_ncbi_tools.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ task Fetch_SRA_to_BAM {
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/ncbi-tools:2.10.7.10"
}
Int disk_size = 750
Int disk_size = 6000
meta {
description: "This searches NCBI SRA for accessions using the Entrez interface, collects associated metadata, and returns read sets as unaligned BAM files with metadata loaded in. Useful metadata from BioSample is also output from this task directly. This has been tested with both SRA and ENA accessions. This queries the NCBI production database, and as such, the output of this task is non-deterministic given the same input."
volatile: true
Expand All @@ -26,7 +26,8 @@ task Fetch_SRA_to_BAM {
MODEL=$(jq -r ".EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM.$PLATFORM.INSTRUMENT_MODEL" SRA.json)
SAMPLE=$(jq -r '.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.IDENTIFIERS.EXTERNAL_ID|select(.namespace == "BioSample")|.content' SRA.json)
LIBRARY=$(jq -r .EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.alias SRA.json)
RUNDATE=$(jq -r '.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.RUN_SET.RUN.SRAFiles|if (.SRAFile|type) == "object" then .SRAFile.date else [.SRAFile[]|select(.supertype == "Original")][0].date end' SRA.json | cut -f 1 -d ' ')
# if there are multiple runs, select the one matching the SRA accession specified in the task input
RUNDATE=$(jq -r '(.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.RUN_SET | (if (.RUN|type) == "object" then (.RUN) else (.RUN[] | select(any(.; .accession == "~{SRA_ID}"))) end) | .SRAFiles) | if (.SRAFile|type) == "object" then .SRAFile.date else [.SRAFile[]|select(.supertype == "Original" or .supertype=="Primary ETL")][0].date end' SRA.json | cut -f 1 -d ' ')

if [[ -n "~{sample_name}" ]]; then
SAMPLE="~{sample_name}"
Expand Down
103 changes: 103 additions & 0 deletions pipes/WDL/workflows/fetch_multiple_sra_to_bams.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
version 1.0

import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
import "../tasks/tasks_utils.wdl" as utils

# Fetch reads for several SRA run accessions in parallel (one Fetch_SRA_to_BAM
# call per accession) and collect the per-run metadata both as a nested map
# and as a single TSV file.
#
# Inputs:
#   SRA_IDs  non-empty list of SRA run accessions.
# Outputs:
#   read_bams                   one unaligned BAM per requested accession.
#   collected_sra_metadata      per accession, a single-entry map of
#                               accession -> {metadata field -> value}.
#   collected_sra_metadata_tsv  header row plus one row per accession.
#
# NOTE(review): the meta.email value below looks like an address that was
# obfuscated when this page was captured -- confirm and restore the real
# maintainer address before merging.
workflow fetch_multiple_sra_to_bams {
    meta {
        description: "Retrieve reads for multiple SRA run IDs from the NCBI Sequence Read Archive in unaligned BAM format (multiple bam files) with relevant metadata encoded."
        author: "Broad Viral Genomics"
        email: "[email protected]"
        allowNestedInputs: true
    }

    input {
        Array[String]+ SRA_IDs
    }

    parameter_meta {
        SRA_IDs: {
            description: "SRA run accessions (ex. *RR#######), NOT SRA study or sample accessions."
        }
    }

    scatter(sra_id in SRA_IDs) {
        call ncbi_tools.Fetch_SRA_to_BAM as scattered_fetch_sra_to_bam {
            input:
                SRA_ID = sra_id
        }

        # Field name -> value for this accession. The map is declared as
        # Map[String,String], so every task output is stored in String form.
        Map[String,String] sra_outputs_map = {
            "reads_ubam":                scattered_fetch_sra_to_bam.reads_ubam,
            "sequencing_center":         scattered_fetch_sra_to_bam.sequencing_center,
            "sequencing_platform":       scattered_fetch_sra_to_bam.sequencing_platform,
            "sequencing_platform_model": scattered_fetch_sra_to_bam.sequencing_platform_model,
            "biosample_accession":       scattered_fetch_sra_to_bam.biosample_accession,
            "library_id":                scattered_fetch_sra_to_bam.library_id,
            "run_date":                  scattered_fetch_sra_to_bam.run_date,
            "sample_collection_date":    scattered_fetch_sra_to_bam.sample_collection_date,
            "sample_collected_by":       scattered_fetch_sra_to_bam.sample_collected_by,
            "sample_strain":             scattered_fetch_sra_to_bam.sample_strain,
            "sample_geo_loc":            scattered_fetch_sra_to_bam.sample_geo_loc,
            "sra_metadata":              scattered_fetch_sra_to_bam.sra_metadata
        }

        # One TSV row for this accession. The element order here MUST stay
        # in step with metadata_header below, which supplies the column names.
        Array[String] metadata_for_accession = [
            sra_id,
            scattered_fetch_sra_to_bam.reads_ubam,
            scattered_fetch_sra_to_bam.sequencing_center,
            scattered_fetch_sra_to_bam.sequencing_platform,
            scattered_fetch_sra_to_bam.sequencing_platform_model,
            scattered_fetch_sra_to_bam.biosample_accession,
            scattered_fetch_sra_to_bam.library_id,
            scattered_fetch_sra_to_bam.run_date,
            scattered_fetch_sra_to_bam.sample_collection_date,
            scattered_fetch_sra_to_bam.sample_collected_by,
            scattered_fetch_sra_to_bam.sample_strain,
            scattered_fetch_sra_to_bam.sample_geo_loc,
            scattered_fetch_sra_to_bam.sra_metadata
        ]

        # Re-declared inside the scatter so it is gathered into an array that
        # lines up index-for-index with sra_outputs_map for the zip() below.
        String sra_accession = sra_id
    }

    # create mapping from input SRA_ID to corresponding map of k:v containing metadata
    scatter(paired_metadata in zip(sra_accession, sra_outputs_map)) {
        Map[String,Map[String,String]] combined_output_map = {
            paired_metadata.left: paired_metadata.right
        }
    }

    # Column names for the combined TSV; keep in the same order as the
    # elements of metadata_for_accession above.
    Array[String] metadata_header = [
        "sra_run_accession",
        "reads_ubam",
        "sequencing_center",
        "sequencing_platform",
        "sequencing_platform_model",
        "biosample_accession",
        "library_id",
        "run_date",
        "sample_collection_date",
        "sample_collected_by",
        "sample_strain",
        "sample_geo_loc",
        "sra_metadata"
    ]

    #String input_ids_string = sep('_',SRA_IDs) # WDL >=1.1 (join all specified IDs)
    # WDL 1.0: just use the first ID. Indexing [0] is safe because SRA_IDs is
    # declared Array[String]+ (non-empty).
    String input_ids_string = SRA_IDs[0]

    call utils.concatenate as combined_metadata {
        input:
            # note that metadata_for_accession has type Array[Array[String]] since it is plural gathered scatter output
            infiles     = [write_tsv([metadata_header]), write_tsv(metadata_for_accession)],
            output_name = "run_metadata-~{input_ids_string}.tsv"
    }

    output {
        # bam files for requested SRA IDs
        Array[File] read_bams = scattered_fetch_sra_to_bam.reads_ubam

        # one single-entry map per accession: accession -> {field name -> value}
        Array[ Map[ String, Map[String,String] ] ] collected_sra_metadata = combined_output_map
        # header row followed by one metadata row per accession
        File collected_sra_metadata_tsv = combined_metadata.combined
    }
}
Loading