From 7d67449c8845efad60c7739147df897ed6e61a42 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Fri, 29 Nov 2024 23:21:04 -0500 Subject: [PATCH] enable FRAG support in callpeak, pileup and hmmratac --- ChangeLog | 10 ++++++++-- MACS3/Commands/callpeak_cmd.py | 4 ++-- MACS3/Commands/hmmratac_cmd.py | 7 +++++-- MACS3/Utilities/Constants.py | 2 +- MACS3/Utilities/OptValidator.py | 14 ++++++++++++-- bin/macs3 | 31 ++++++++++++++++++++----------- 6 files changed, 48 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7d00be76..97c94109 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,5 @@ -2024-10-08 Tao Liu - MACS 3.0.3b +2024-11-30 Tao Liu + MACS 3.0.3b1 * Features added @@ -27,6 +27,12 @@ alignment records that overlap with givin genomic coordinates using BAM/BAI files. + 2) `hmmratac`: wrong class name used while saving digested signals + in BedGraph files. + + 3) `predictd` and `filterdup`: wrong variable name used while + reading multiple pe/frag files. + * Doc 1) Explanation on the filtering criteria on SAM/BAM/BAMPE files. diff --git a/MACS3/Commands/callpeak_cmd.py b/MACS3/Commands/callpeak_cmd.py index 024f539e..00954226 100644 --- a/MACS3/Commands/callpeak_cmd.py +++ b/MACS3/Commands/callpeak_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-10-02 15:58:36 Tao Liu> +# Time-stamp: <2024-11-29 21:43:31 Tao Liu> """Description: MACS 3 call peak main executable @@ -57,7 +57,7 @@ def run(args): # 0 output arguments info("\n"+options.argtxt) - options.PE_MODE = options.format in ('BAMPE', 'BEDPE') + options.PE_MODE = options.format in ('BAMPE', 'BEDPE', 'FRAG') if options.PE_MODE: tag = 'fragment' # call things fragments not tags else: diff --git a/MACS3/Commands/hmmratac_cmd.py b/MACS3/Commands/hmmratac_cmd.py index 57ab2f4f..f4ed9999 100644 --- a/MACS3/Commands/hmmratac_cmd.py +++ b/MACS3/Commands/hmmratac_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-10-08 10:53:48 Tao Liu> +# Time-stamp: <2024-11-29 23:19:38 Tao Liu> """Description: Main HMMR command @@ -24,7 +24,7 @@ # from MACS3.Utilities.Constants import * from MACS3.Utilities.OptValidator import opt_validate_hmmratac from MACS3.IO.PeakIO import PeakIO -from MACS3.IO.Parser import BAMPEParser, BEDPEParser # BAMaccessor +from MACS3.IO.Parser import BAMPEParser, BEDPEParser, FragParser # BAMaccessor from MACS3.Signal.HMMR_EM import HMMR_EM from MACS3.Signal.HMMR_Signal_Processing import (generate_weight_mapping, generate_digested_signals, @@ -103,6 +103,9 @@ def run(args): elif options.format == "BEDPE": options.info("#1 Read fragments from BEDPE file...") parser = BEDPEParser + elif options.format == "FRAG": + options.info("#1 Read fragments from FRAG file...") + parser = FragParser else: raise Exception("wrong format") diff --git a/MACS3/Utilities/Constants.py b/MACS3/Utilities/Constants.py index 8e64282e..b9343b23 100644 --- a/MACS3/Utilities/Constants.py +++ b/MACS3/Utilities/Constants.py @@ -1,4 +1,4 @@ -MACS_VERSION = "3.0.3b" +MACS_VERSION = "3.0.3b1" MAX_PAIRNUM = 1000 MAX_LAMBDA = 100000 FESTEP = 20 diff --git a/MACS3/Utilities/OptValidator.py b/MACS3/Utilities/OptValidator.py index 451fe0b4..ae7d98eb 100644 --- a/MACS3/Utilities/OptValidator.py +++ b/MACS3/Utilities/OptValidator.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-10-02 19:47:03 Tao Liu> +# Time-stamp: <2024-11-29 22:11:31 Tao Liu> """Module Description This code is free software; you can redistribute it and/or modify it @@ -19,7 +19,9 @@ from MACS3.IO.Parser import (BEDParser, ELANDResultParser, ELANDMultiParser, ELANDExportParser, SAMParser, BAMParser, BAMPEParser, - BEDPEParser, BowtieParser, guess_parser) + BEDPEParser, BowtieParser, + FragParser, + guess_parser) from MACS3.Utilities.Constants import EFFECTIVEGS as efgsize @@ -85,6 +87,8 @@ def opt_validate_callpeak(options): options.nomodel = True elif options.format == "BOWTIE": options.parser = BowtieParser + elif options.format == "FRAG": + options.parser = FragParser elif options.format == "AUTO": options.parser = guess_parser else: @@ -96,6 +100,10 @@ def opt_validate_callpeak(options): if not options.keepduplicates.isdigit(): logger.error("--keep-dup should be 'auto', 'all' or an integer!") sys.exit(1) + # for duplicate reads filter, if format is FRAG, we turn it off by + # setting it as 'all' + if options.format == 'FRAG' and options.keepduplicates != "all": + logger.warning("Since the format is 'FRAG', `--keep-dup` will be set as 'all'.") if options.extsize < 1: logger.error("--extsize must >= 1!") @@ -528,6 +536,8 @@ def opt_validate_pileup(options): options.gzip_flag = True elif options.format == "BEDPE": options.parser = BEDPEParser + elif options.format == "FRAG": + options.parser = FragParser else: logger.error("Format \"%s\" cannot be recognized!" % (options.format)) sys.exit(1) diff --git a/bin/macs3 b/bin/macs3 index 0df21cf8..cfa34a96 100644 --- a/bin/macs3 +++ b/bin/macs3 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2024-10-02 12:29:58 Tao Liu> +# Time-stamp: <2024-11-29 21:39:49 Tao Liu> """Description: MACS v3 main executable. @@ -192,6 +192,10 @@ def add_callpeak_parser(subparsers): $ macs3 callpeak -f BAMPE -t ATAC.bam -g hs -n test -B -q 0.01 4. Peak calling on ATAC-seq (focusing on insertion sites, and using single-end mode): $ macs3 callpeak -f BAM -t ATAC.bam -g hs -n test -B -q 0.01 --shift -50 --extension 100 +5. Peak calling on scATAC-seq (paired-end mode): + $ macs3 callpeak -f FRAG -t scATAC.fragments.tsv.gz -g hs -n test -B -q 0.01 -n test +6. Peak calling on scATAC-seq (paired-end mode) and only for given barcodes: + $ macs3 callpeak -f FRAG -t scATAC.fragments.tsv.gz -g hs -n test -B -q 0.01 -n test --barcodes barcodes.txt """) # group for input files @@ -203,15 +207,17 @@ def add_callpeak_parser(subparsers): group_input.add_argument("-f", "--format", dest="format", type=str, choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", - "BAMPE", "BEDPE"), - help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"", + "BAMPE", "BEDPE", "FRAG"), + help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\" or \"FRAG\". The default AUTO option will let MACS decide which format (except for BAMPE, BEDPE, and FRAG which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE, BEDPE or FRAG, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. The FRAG format is for single-cell ATAC-seq fragment file which is a five columns BEDPE file with extra barcode and fragment count column. DEFAULT: \"AUTO\"", default="AUTO") group_input.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs", help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") group_input.add_argument("-s", "--tsize", dest="tsize", type=int, default=None, help="Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set") group_input.add_argument("--keep-dup", dest="keepduplicates", type=str, default="1", - help="It controls the behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS3 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1") + help="It controls the behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS3 to keep all by '--keep-dup all'. If the format is FRAG, this option will be ignored and MACS3 will behave like '--keep-dup all'. The default is to keep one tag at the same location. Default: 1") + group_input.add_argument("--barcodes", dest="barcodefile", type=str, default="", + help="A plain text file containing the barcodes for the fragment file while the format is 'FRAG'. Won't have any effect if the fromat is not 'FRAG'. Each row in the file should only have the barcode string. MACS3 will extract only the fragments for the specified barcodes.") # group for output files group_output = argparser_callpeak.add_argument_group("Output arguments") @@ -314,7 +320,7 @@ def add_filterdup_parser(subparsers): help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.") argparser_filterdup.add_argument("-f", "--format", dest="format", type=str, choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"), - help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"", + help="Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let MACS3 guess which format the file is. Please check the definition in README file for each specific format. DEFAULT: \"AUTO\"", default="AUTO") argparser_filterdup.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs", help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") @@ -336,6 +342,7 @@ def add_filterdup_parser(subparsers): help="When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set") return + def add_bdgpeakcall_parser(subparsers): """Add function 'peak calling on bedGraph' argument parsers. """ @@ -368,6 +375,7 @@ def add_bdgpeakcall_parser(subparsers): return + def add_bdgbroadcall_parser(subparsers): """Add function 'broad peak calling on bedGraph' argument parsers. """ @@ -394,6 +402,7 @@ def add_bdgbroadcall_parser(subparsers): add_output_group(argparser_bdgbroadcall) return + def add_bdgcmp_parser(subparsers): """Add function 'peak calling on bedGraph' argument parsers. """ @@ -422,6 +431,7 @@ def add_bdgcmp_parser(subparsers): help="Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m.") return + def add_bdgopt_parser(subparsers): """Add function 'operations on score column of bedGraph' argument parsers. """ @@ -526,6 +536,7 @@ def add_bdgdiff_parser(subparsers): return + def add_refinepeak_parser(subparsers): argparser_refinepeak = subparsers.add_parser("refinepeak", help="Take raw reads alignment, refine peak summits. Inspired by SPP.") @@ -549,7 +560,6 @@ def add_refinepeak_parser(subparsers): add_outdir_option(argparser_refinepeak) add_output_group(argparser_refinepeak) - return @@ -586,7 +596,6 @@ def add_predictd_parser(subparsers): argparser_predictd.add_argument("--verbose", dest="verbose", type=int, default=2, help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") - return @@ -599,8 +608,8 @@ def add_pileup_parser(subparsers): help="Output bedGraph file name. If not specified, will write to standard output. REQUIRED.") add_outdir_option(argparser_pileup) argparser_pileup.add_argument("-f", "--format", dest="format", type=str, - choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"), - help="Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS3 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.", + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE", "FRAG"), + help="Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", \"BEDPE\", or \"FRAG\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS3 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE, BEDPE or FRAG, please specify it explicitly. Please note that when the format is BAMPE, BEDPE or FRAG, the -B and --extsize options would be ignored, and MACS3 will process the input in Paired-End mode.", default="AUTO") argparser_pileup.add_argument("-B", "--both-direction", dest="bothdirection", action="store_true", default=False, help="By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS3 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS3 way of piling up control reads. However MACS3 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False") @@ -857,8 +866,8 @@ plus an extra option for the HMM model file like `macs3 hmmratac group_input.add_argument("-i", "--input", dest="input_file", type=str, required=True, nargs="+", help="Input files containing the aligment results for ATAC-seq paired end reads. If multiple files are given as '-t A B C', then they will all be read and pooled together. The file should be in BAMPE or BEDPE format (aligned in paired end mode). Files can be gzipped. Note: all files should be in the same format! REQUIRED.") group_input.add_argument("-f", "--format", dest="format", type=str, - choices=("BAMPE", "BEDPE"), - help="Format of input files, \"BAMPE\" or \"BEDPE\". If there are multiple files, they should be in the same format -- either BAMPE or BEDPE. Please check the definition in README. Also please note that the BEDPE only contains three columns -- chromosome, left position of the whole pair, right position of the whole pair-- and is NOT the same BEDPE format used by BEDTOOLS. To convert BAMPE to BEDPE, you can use this command `macs3 filterdup --keep-dup all -f BAMPE -i input.bam -o output.bedpe`. DEFAULT: \"BAMPE\"", + choices=("BAMPE", "BEDPE", "FRAG"), + help="Format of input files, \"BAMPE\", \"BEDPE\", or \"FRAG\". If there are multiple files, they should be in the same format -- either BAMPE, BEDPE or FRAG. Please check the definition in README. Also please note that the BEDPE only contains three columns -- chromosome, left position of the whole pair, right position of the whole pair-- and is NOT the same BEDPE format used by BEDTOOLS. To convert BAMPE to BEDPE, you can use this command `macs3 filterdup --keep-dup all -f BAMPE -i input.bam -o output.bedpe`. And the FRAG format is a five columns BEDPE with extra barcode and fragment count columns. DEFAULT: \"BAMPE\"", default="BAMPE") # group for output files