Skip to content

Commit

Permalink
first full draft
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Jan 14, 2025
1 parent 3bb3b4b commit 3590989
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 162 deletions.
202 changes: 77 additions & 125 deletions pipes/WDL/tasks/tasks_ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,61 @@ task structured_comments {
}
}
task structured_comments_from_coverage_report {
input {
File coverage_report
File assembly_fasta
String assembly_method
String assembly_method_version
String sequencing_tech
Boolean is_genome_assembly = true
String docker = "quay.io/broadinstitute/py3-bio:0.1.2"
}
String out_basename = basename(coverage_report, '.txt')
# see https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/
command <<<
set -e
python3 << CODE
import Bio.SeqIO
import csv
# get list of sequence IDs in fasta
with open("~{assembly_fasta}", "rt") as inf:
seqids = set(seq.id for seq in Bio.SeqIO.parse(inf, "fasta"))
# this header has to be in this specific order -- don't reorder the columns!
out_headers = ('SeqID', 'StructuredCommentPrefix', 'Assembly Method', "~{true='Genome ' false='' is_genome_assembly}Coverage", 'Sequencing Technology', 'StructuredCommentSuffix')
with open("~{out_basename}.cmt", 'wt') as outf:
outf.write('\t'.join(out_headers)+'\n')
with open("~{coverage_report}", "rt") as inf:
for row in csv.DictReader(inf, delimiter='\t'):
if row.get('sample') and row.get('aln2self_cov_median') and row['sample'] in seqids:
outrow = {
'SeqID': row['sample'],
'Assembly Method': "~{assembly_method} v. ~{assembly_method_version}", # note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
'Sequencing Technology': "~{sequencing_tech}",
"~{true='Genome ' false='' is_genome_assembly}Coverage": "{}x".format(round(float(row['aln2self_cov_median']))),
'StructuredCommentPrefix': "~{true='Genome-' false='' is_genome_assembly}Assembly-Data",
'StructuredCommentSuffix': "~{true='Genome-' false='' is_genome_assembly}Assembly-Data",
}
outf.write('\t'.join(outrow[h] for h in out_headers)+'\n')
CODE
>>>
output {
File structured_comment_file = "~{out_basename}.cmt"
}
runtime {
docker: docker
memory: "1 GB"
cpu: 1
dx_instance_type: "mem1_ssd1_v2_x2"
maxRetries: 2
}
}
task prefix_fasta_header {
input {
File genome_fasta
Expand Down Expand Up @@ -865,7 +920,7 @@ task generate_author_sbt_file {
String? author_list
File j2_template
File? defaults_yaml
String? out_base = "authors"
String out_base = "authors"
String docker = "quay.io/broadinstitute/py3-bio:0.1.2"
}
Expand Down Expand Up @@ -1139,29 +1194,28 @@ task prepare_genbank {
}
}
task prepare_genbank_single {
task table2asn {
meta {
description: "this task runs NCBI's table2asn (unless the organism is flu/sc2/noro/dengue)"
description: "This task runs NCBI's table2asn, the artist formerly known as tbl2asn"
}
input {
File assembly_fasta
File annotations_tbl
File authors_sbt
String biosample_accession
File? source_modifier_table
File? coverage_table
String sequencingTech
File? structured_comment_file
String? comment
String organism
String molType
String assembly_method
String assembly_method_version
String mol_type = "cRNA"
Int genetic_code = 1
Int machine_mem_gb = 3
String docker = "quay.io/broadinstitute/viral-phylo:2.3.6.0"
}
String out_basename = basename(assembly_fasta, ".fasta")
parameter_meta {
assembly_fasta: {
description: "Assembled genome. All chromosomes/segments in one file.",
Expand All @@ -1175,147 +1229,45 @@ task prepare_genbank_single {
description: "A genbank submission template file (SBT) with the author list, created at https://submit.ncbi.nlm.nih.gov/genbank/template/submission/",
patterns: ["*.sbt"]
}
genbankSourceTable: {
source_modifier_table: {
description: "A tab-delimited text file containing requisite metadata for Genbank (a 'source modifier table'). https://www.ncbi.nlm.nih.gov/WebSub/html/help/genbank-source-table.html",
patterns: ["*.txt", "*.tsv"]
patterns: ["*.src"]
}
coverage_table: {
description: "A two column tab text file mapping sample IDs (first column) to average sequencing coverage (second column, floating point number).",
patterns: ["*.txt", "*.tsv"]
}
sequencingTech: {
description: "The type of sequencer used to generate reads. NCBI has a controlled vocabulary for this value which can be found here: https://submit.ncbi.nlm.nih.gov/structcomment/nongenomes/"
structured_comment_file: {
patterns: ["*.cmt"]
}
organism: {
description: "The scientific name for the organism being submitted. This is typically the species name and should match the name given by the NCBI Taxonomy database. For more info, see: https://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#Organism"
}
molType: {
description: "The type of molecule being described. Any value allowed by the INSDC controlled vocabulary may be used here. Valid values are described at http://www.insdc.org/controlled-vocabulary-moltype-qualifier"
}
assembly_method: {
description: "Very short description of the software approach used to assemble the genome. We typically provide a github link here. If this is specified, assembly_method_version should also be specified."
}
assembly_method_version: {
description: "The version of the software used. If this is specified, assembly_method should also be specified."
}
comment: {
description: "Optional comments that can be displayed in the COMMENT section of the Genbank record. This may include any disclaimers about assembly quality or notes about pre-publication availability or requests to discuss pre-publication use with authors."
}
}
command <<<
set -e
table2asn -version | cut -f 2 -d ' ' > TABLE2ASN_VERSION
cp "~{assembly_fasta}" "~{out_basename}.fsa" # input fasta must be in CWD so output files end up next to it
touch "~{out_basename}.val" # this file isn't produced if no errors/warnings
python3 << CODE
import ncbi
import util.file
import Bio.SeqIO
# process assembly fasta
fn = '~{assembly_fasta}'
if not fn.endswith('.fasta'):
raise Exception("fasta file must end in .fasta")
sample_base = os.path.basename(fn)[:-6]
with open(fn, "r") as inf:
seq_ids = list(seq.id for seq in Bio.SeqIO.parse(inf, 'fasta'))
n_segs = len(seq_ids)
# get coverage map per segment
coverage = {}
coverage_table_file = '~{default="" coverage_table}'
if coverage_table_file:
for row in util.file.read_tabfile_dict(coverage_table_file):
if row.get('sample') and row.get('aln2self_cov_median'):
coverage[row['sample']] = row['aln2self_cov_median']
# get biosample id map
biosample = {}
if biosample_map:
for row in util.file.read_tabfile_dict(biosample_map):
if row.get('sample') and row.get('BioSample'):
biosample[row['sample']] = row['BioSample']
# make output directory
util.file.mkdir_p(annotDir)
# for each segment/chromosome in the fasta file,
# create a separate new *.fsa file
with open(fn, "r") as inf:
asm_fasta = Bio.SeqIO.parse(inf, 'fasta')
for idx, seq_obj in enumerate(asm_fasta):
seq_obj.id
if n_segs>1:
sample = sample_base + "-" + str(idx+1)
else:
sample = sample_base
# make .cmt files
ncbi.make_structured_comment_file(os.path.join(annotDir, util.file.string_to_file_name(seq_obj.id) + '.cmt'),
name=seq_obj.id,
coverage=coverage.get(seq_obj.id, coverage.get(sample_base, coverage.get(util.file.string_to_file_name(seq_obj.id)))),
seq_tech=sequencing_tech,
assembly_method=assembly_method,
assembly_method_version=assembly_method_version)
CODE
cp "~{assembly_fasta}" .
IN_FASTA=$(basename "~{assembly_fasta}")
table2asn \
-t "~{authors_sbt}" \
-i "$IN_FASTA" \
-i "~{out_basename}.fsa" \
-f "~{annotations_tbl}" \
"~{'-src-file ' + source_modifier_table}" \
"~{'-w ' + structured_comment_file}" \
-j '[gcode=~{genetic_code}][moltype=~{mol_type}][organism=~{organism}]' \
~{'-src-file ' + source_modifier_table} \
~{'-y "' + comment + '"'} \
-a s -V vb
# need to convey coverage table / structured comment field, genetic code, organism, moltype, etc
touch special_args
if [ -n "~{comment}" ]; then
echo "--comment" >> special_args
echo "~{comment}" >> special_args
fi
echo "--sequencing_tech" >> special_args
echo "~{sequencingTech}" >> special_args
echo "--organism" >> special_args
echo "~{organism}" >> special_args
echo "--mol_type" >> special_args
echo "~{molType}" >> special_args
echo "--assembly_method" >> special_args
echo "~{assembly_method}" >> special_args
echo "--assembly_method_version" >> special_args
echo "~{assembly_method_version}" >> special_args
echo -e "sample\taln2self_cov_median" > coverage_table.txt
cat "~{coverage_table}" >> coverage_table.txt
echo "--coverage_table" >> special_args
echo coverage_table.txt >> special_args
for i in `cat "~{assembly_fasta}" | grep \> | cut -c 2-`; do
echo -e "~{biosample_accession}\t$i" >> biosample_map.txt
done
cat special_args | xargs -d '\n' ncbi.py prep_genbank_files \
"~{authors_sbt}" \
"~{assembly_fasta}" \
. \
--biosample_map biosample_map.txt \
--master_source_table "~{source_modifier_table}" \
--loglevel DEBUG
zip genbank_debug-all_files.zip *.sqn *.cmt *.gbf *.src *.fsa *.val
mv errorsummary.val errorsummary.val.txt # to keep it separate from the glob
>>>
output {
File debug_zip = "genbank_debug-all_files.zip"
File genbank_submission_sqn = glob("*.sqn")[0]
File genbank_preview_file = glob("*.gbf")[0]
File genbank_comment_file = glob("*.cmt")[0]
File errorSummary = "errorsummary.val.txt"
File genbank_submission_sqn = "~{out_basename}.sqn"
File genbank_preview_file = "~{out_basename}.gbf"
File genbank_validation_file = "~{out_basename}.val"
String table2asn_version = read_string("TABLE2ASN_VERSION")
}
Expand Down
58 changes: 21 additions & 37 deletions pipes/WDL/workflows/genbank_single.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,9 @@ workflow genbank_single {
File aligned_reads_bam
String biosample_accession
Int tax_id
String organism_name
String assembly_method
String assembly_method_version

String email_address # required for fetching data from NCBI APIs
File authors_sbt
File? biosample_attributes_tsv # if empty, we will fetch from NCBI via accession
String? comment
String molType='cRNA'
}

parameter_meta {
Expand All @@ -45,14 +39,6 @@ workflow genbank_single {
description: "A post-submission attributes file from NCBI BioSample, which is available at https://submit.ncbi.nlm.nih.gov/subs/ and clicking on 'Download attributes file with BioSample accessions'.",
patterns: ["*.txt", "*.tsv"]
}
molType: {
description: "The type of molecule being described. This defaults to 'cRNA' as this pipeline is most commonly used for viral submissions, but any value allowed by the INSDC controlled vocabulary may be used here. Valid values are described at http://www.insdc.org/controlled-vocabulary-moltype-qualifier",
category: "common"
}
comment: {
description: "Optional comments that can be displayed in the COMMENT section of the Genbank record. This may include any disclaimers about assembly quality or notes about pre-publication availability or requests to discuss pre-publication use with authors."
}

}

# fetch biosample metadata from NCBI if it's not given to us in tsv form
Expand All @@ -72,18 +58,10 @@ workflow genbank_single {
}
call reports.coverage_report {
input:
mapped_bams = [aligned_reads_bam]
mapped_bams = [aligned_reads_bam],
out_report_name = "coverage_report-~{basename(aligned_reads_bam, '.bam')}.txt"
}

# create genbank source modifier table from biosample metadata
call ncbi.biosample_to_genbank {
input:
biosample_attributes = biosample_attributes,
num_segments = length(reference_accessions),
taxid = tax_id
}

# Is this a special virus that NCBI handles differently?
call ncbi.genbank_special_taxa {
input:
Expand All @@ -106,6 +84,13 @@ workflow genbank_single {
combined_out_prefix = segment_acc
}
}
# create genbank source modifier table from biosample metadata
call ncbi.biosample_to_genbank {
input:
biosample_attributes = biosample_attributes,
num_segments = length(string_split.tokens),
taxid = tax_id
}
## naive liftover of gene coordinates by alignment
call ncbi.align_and_annot_transfer_single as annot {
input:
Expand All @@ -124,29 +109,28 @@ workflow genbank_single {
}
File feature_tbl = select_first([vadr.feature_tbl, annot.feature_tbl])

call ncbi.structured_comments_from_coverage_report {
input:
coverage_report = coverage_report.coverage_report,
assembly_fasta = assembly_fasta,
sequencing_tech = sequencing_platform_from_bam.genbank_sequencing_technology
}

if(genbank_special_taxa.table2asn_allowed) {
call ncbi.prepare_genbank_single as prep_genbank {
call ncbi.table2asn {
input:
assembly_fasta = assembly_fasta,
annotations_tbl = feature_tbl,
authors_sbt = authors_sbt,
biosample_accession = biosample_accession,
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
coverage_table = coverage_report.coverage_report,
sequencingTech = sequencing_platform_from_bam.genbank_sequencing_technology,
comment = comment,
organism = organism_name,
molType = molType,
assembly_method = assembly_method,
assembly_method_version = assembly_method_version
structured_comment_file = structured_comments_from_coverage_report.structured_comment_file
}
}

output {
File? genbank_submission_sqn = prep_genbank.genbank_submission_sqn
File? genbank_preview_file = prep_genbank.genbank_preview_file
File? genbank_comment_file = prep_genbank.genbank_comment_file
File? table2asn_errors = prep_genbank.errorSummary
File? genbank_submission_sqn = table2asn.genbank_submission_sqn
File? genbank_preview_file = table2asn.genbank_preview_file
File? table2asn_errors = table2asn.genbank_validation_file
File genbank_comment_file = structured_comments_from_coverage_report.structured_comment_file
Boolean? vadr_pass = vadr.pass

File genbank_source_table = biosample_to_genbank.genbank_source_modifier_table
Expand Down

0 comments on commit 3590989

Please sign in to comment.